diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/dpdk/kernel/linux | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/dpdk/kernel/linux')
-rw-r--r-- | src/spdk/dpdk/kernel/linux/Makefile | 9 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/igb_uio/Kbuild | 2 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/igb_uio/Makefile | 25 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/igb_uio/compat.h | 154 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/igb_uio/igb_uio.c | 660 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/igb_uio/meson.build | 20 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/Kbuild | 6 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/Makefile | 34 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/compat.h | 136 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/kni_dev.h | 127 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/kni_fifo.h | 87 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/kni_misc.c | 661 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/kni_net.c | 844 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/kni/meson.build | 28 | ||||
-rw-r--r-- | src/spdk/dpdk/kernel/linux/meson.build | 28 |
15 files changed, 2821 insertions, 0 deletions
diff --git a/src/spdk/dpdk/kernel/linux/Makefile b/src/spdk/dpdk/kernel/linux/Makefile new file mode 100644 index 000000000..c2c45a3e6 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright 2017 NXP + +include $(RTE_SDK)/mk/rte.vars.mk + +DIRS-$(CONFIG_RTE_EAL_IGB_UIO) += igb_uio +DIRS-$(CONFIG_RTE_KNI_KMOD) += kni + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/src/spdk/dpdk/kernel/linux/igb_uio/Kbuild b/src/spdk/dpdk/kernel/linux/igb_uio/Kbuild new file mode 100644 index 000000000..3ab85c411 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/igb_uio/Kbuild @@ -0,0 +1,2 @@ +ccflags-y := $(MODULE_CFLAGS) +obj-m := igb_uio.o diff --git a/src/spdk/dpdk/kernel/linux/igb_uio/Makefile b/src/spdk/dpdk/kernel/linux/igb_uio/Makefile new file mode 100644 index 000000000..f83bcc7c6 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/igb_uio/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# module name and path +# +MODULE = igb_uio +MODULE_PATH = drivers/net/igb_uio + +# +# CFLAGS +# +MODULE_CFLAGS += -I$(SRCDIR) --param max-inline-insns-single=100 +MODULE_CFLAGS += -I$(RTE_OUTPUT)/include +MODULE_CFLAGS += -Winline -Wall -Werror +MODULE_CFLAGS += -include $(RTE_OUTPUT)/include/rte_config.h + +# +# all source are stored in SRCS-y +# +SRCS-y := igb_uio.c + +include $(RTE_SDK)/mk/rte.module.mk diff --git a/src/spdk/dpdk/kernel/linux/igb_uio/compat.h b/src/spdk/dpdk/kernel/linux/igb_uio/compat.h new file mode 100644 index 000000000..8dbb896ae --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/igb_uio/compat.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Minimal wrappers to allow compiling igb_uio on older kernels. + */ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a, b) (((a) << 8) + (b)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +#define pci_cfg_access_lock pci_block_user_cfg_access +#define pci_cfg_access_unlock pci_unblock_user_cfg_access +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +#define HAVE_PTE_MASK_PAGE_IOMAP +#endif + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +/* + * for kernels < 2.6.38 and backported patch that moves MSI-X entry definition + * to pci_regs.h Those kernels has PCI_MSIX_ENTRY_SIZE defined but not + * PCI_MSIX_ENTRY_CTRL_MASKBIT + */ +#ifndef PCI_MSIX_ENTRY_CTRL_MASKBIT +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 9))) + +static int pci_num_vf(struct pci_dev *dev) +{ + struct iov { + int pos; + int nres; + u32 cap; + u16 ctrl; + u16 total; + u16 initial; + u16 nr_virtfn; + } *iov = (struct iov *)dev->sriov; + + if (!dev->is_physfn) + return 0; + + return iov->nr_virtfn; +} + +#endif /* < 2.6.34 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 39) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) + +#define kstrtoul strict_strtoul + +#endif /* < 2.6.39 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 3))) + +/* Check if INTX works to control irq's. + * Set's INTX_DISABLE flag and reads it back + */ +static bool pci_intx_mask_supported(struct pci_dev *pdev) +{ + bool mask_supported = false; + uint16_t orig, new; + + pci_block_user_cfg_access(pdev); + pci_read_config_word(pdev, PCI_COMMAND, &orig); + pci_write_config_word(pdev, PCI_COMMAND, + orig ^ PCI_COMMAND_INTX_DISABLE); + pci_read_config_word(pdev, PCI_COMMAND, &new); + + if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) { + dev_err(&pdev->dev, "Command register changed from " + "0x%x to 0x%x: driver or hardware bug?\n", orig, new); + } else if ((new ^ orig) & PCI_COMMAND_INTX_DISABLE) { + mask_supported = true; + pci_write_config_word(pdev, PCI_COMMAND, orig); + } + pci_unblock_user_cfg_access(pdev); + + return mask_supported; +} + +static bool pci_check_and_mask_intx(struct pci_dev *pdev) +{ + bool pending; + uint32_t status; + + pci_block_user_cfg_access(pdev); + pci_read_config_dword(pdev, PCI_COMMAND, &status); + + /* interrupt is not ours, goes to out */ + pending = (((status >> 16) & PCI_STATUS_INTERRUPT) != 0); + if (pending) { + uint16_t old, new; + + old = status; + if (status != 0) + new = old & (~PCI_COMMAND_INTX_DISABLE); + else + new = old | PCI_COMMAND_INTX_DISABLE; + + if (old != new) + pci_write_config_word(pdev, PCI_COMMAND, new); + } + pci_unblock_user_cfg_access(pdev); + + return pending; +} + +#endif /* < 3.3.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +#define HAVE_PCI_IS_BRIDGE_API 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +#define HAVE_MSI_LIST_IN_GENERIC_DEVICE 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define HAVE_PCI_MSI_MASK_IRQ 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) +#define HAVE_ALLOC_IRQ_VECTORS 1 +#endif + +static inline bool igbuio_kernel_is_locked_down(void) +{ +#ifdef CONFIG_LOCK_DOWN_KERNEL +#ifdef CONFIG_LOCK_DOWN_IN_EFI_SECURE_BOOT + return kernel_is_locked_down(NULL); +#elif defined(CONFIG_EFI_SECURE_BOOT_LOCK_DOWN) + return kernel_is_locked_down(); +#else + return false; +#endif +#else + return false; +#endif +} diff --git a/src/spdk/dpdk/kernel/linux/igb_uio/igb_uio.c b/src/spdk/dpdk/kernel/linux/igb_uio/igb_uio.c new file mode 100644 index 000000000..039f5a5f6 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/igb_uio/igb_uio.c @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: GPL-2.0 +/*- + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/uio_driver.h> +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/msi.h> +#include <linux/version.h> +#include <linux/slab.h> + +#include <rte_pci_dev_features.h> + +#include "compat.h" + +/** + * A structure describing the private information for a uio device. + */ +struct rte_uio_pci_dev { + struct uio_info info; + struct pci_dev *pdev; + enum rte_intr_mode mode; + atomic_t refcnt; +}; + +static int wc_activate; +static char *intr_mode; +static enum rte_intr_mode igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; +/* sriov sysfs */ +static ssize_t +show_max_vfs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 10, "%u\n", dev_num_vf(dev)); +} + +static ssize_t +store_max_vfs(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int err = 0; + unsigned long max_vfs; + struct pci_dev *pdev = to_pci_dev(dev); + + if (0 != kstrtoul(buf, 0, &max_vfs)) + return -EINVAL; + + if (0 == max_vfs) + pci_disable_sriov(pdev); + else if (0 == pci_num_vf(pdev)) + err = pci_enable_sriov(pdev, max_vfs); + else /* do nothing if change max_vfs number */ + err = -EINVAL; + + return err ? err : count; +} + +static DEVICE_ATTR(max_vfs, S_IRUGO | S_IWUSR, show_max_vfs, store_max_vfs); + +static struct attribute *dev_attrs[] = { + &dev_attr_max_vfs.attr, + NULL, +}; + +static const struct attribute_group dev_attr_grp = { + .attrs = dev_attrs, +}; + +#ifndef HAVE_PCI_MSI_MASK_IRQ +/* + * It masks the msix on/off of generating MSI-X messages. + */ +static void +igbuio_msix_mask_irq(struct msi_desc *desc, s32 state) +{ + u32 mask_bits = desc->masked; + unsigned int offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL; + + if (state != 0) + mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + else + mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; + + if (mask_bits != desc->masked) { + writel(mask_bits, desc->mask_base + offset); + readl(desc->mask_base); + desc->masked = mask_bits; + } +} + +/* + * It masks the msi on/off of generating MSI messages. + */ +static void +igbuio_msi_mask_irq(struct pci_dev *pdev, struct msi_desc *desc, int32_t state) +{ + u32 mask_bits = desc->masked; + u32 offset = desc->irq - pdev->irq; + u32 mask = 1 << offset; + + if (!desc->msi_attrib.maskbit) + return; + + if (state != 0) + mask_bits &= ~mask; + else + mask_bits |= mask; + + if (mask_bits != desc->masked) { + pci_write_config_dword(pdev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; + } +} + +static void +igbuio_mask_irq(struct pci_dev *pdev, enum rte_intr_mode mode, s32 irq_state) +{ + struct msi_desc *desc; + struct list_head *msi_list; + +#ifdef HAVE_MSI_LIST_IN_GENERIC_DEVICE + msi_list = &pdev->dev.msi_list; +#else + msi_list = &pdev->msi_list; +#endif + + if (mode == RTE_INTR_MODE_MSIX) { + list_for_each_entry(desc, msi_list, list) + igbuio_msix_mask_irq(desc, irq_state); + } else if (mode == RTE_INTR_MODE_MSI) { + list_for_each_entry(desc, msi_list, list) + igbuio_msi_mask_irq(pdev, desc, irq_state); + } +} +#endif + +/** + * This is the irqcontrol callback to be registered to uio_info. + * It can be used to disable/enable interrupt from user space processes. + * + * @param info + * pointer to uio_info. + * @param irq_state + * state value. 1 to enable interrupt, 0 to disable interrupt. + * + * @return + * - On success, 0. + * - On failure, a negative value. + */ +static int +igbuio_pci_irqcontrol(struct uio_info *info, s32 irq_state) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *pdev = udev->pdev; + +#ifdef HAVE_PCI_MSI_MASK_IRQ + struct irq_data *irq = irq_get_irq_data(udev->info.irq); +#endif + + pci_cfg_access_lock(pdev); + + if (udev->mode == RTE_INTR_MODE_MSIX || udev->mode == RTE_INTR_MODE_MSI) { +#ifdef HAVE_PCI_MSI_MASK_IRQ + if (irq_state == 1) + pci_msi_unmask_irq(irq); + else + pci_msi_mask_irq(irq); +#else + igbuio_mask_irq(pdev, udev->mode, irq_state); +#endif + } + + if (udev->mode == RTE_INTR_MODE_LEGACY) + pci_intx(pdev, !!irq_state); + + pci_cfg_access_unlock(pdev); + + return 0; +} + +/** + * This is interrupt handler which will check if the interrupt is for the right device. + * If yes, disable it here and will be enable later. + */ +static irqreturn_t +igbuio_pci_irqhandler(int irq, void *dev_id) +{ + struct rte_uio_pci_dev *udev = (struct rte_uio_pci_dev *)dev_id; + struct uio_info *info = &udev->info; + + /* Legacy mode need to mask in hardware */ + if (udev->mode == RTE_INTR_MODE_LEGACY && + !pci_check_and_mask_intx(udev->pdev)) + return IRQ_NONE; + + uio_event_notify(info); + + /* Message signal mode, no share IRQ and automasked */ + return IRQ_HANDLED; +} + +static int +igbuio_pci_enable_interrupts(struct rte_uio_pci_dev *udev) +{ + int err = 0; +#ifndef HAVE_ALLOC_IRQ_VECTORS + struct msix_entry msix_entry; +#endif + + switch (igbuio_intr_mode_preferred) { + case RTE_INTR_MODE_MSIX: + /* Only 1 msi-x vector needed */ +#ifndef HAVE_ALLOC_IRQ_VECTORS + msix_entry.entry = 0; + if (pci_enable_msix(udev->pdev, &msix_entry, 1) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = msix_entry.vector; + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSIX) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#endif + + /* falls through - to MSI */ + case RTE_INTR_MODE_MSI: +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (pci_enable_msi(udev->pdev) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSI) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#endif + /* falls through - to INTX */ + case RTE_INTR_MODE_LEGACY: + if (pci_intx_mask_supported(udev->pdev)) { + dev_dbg(&udev->pdev->dev, "using INTX"); + udev->info.irq_flags = IRQF_SHARED | IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_LEGACY; + break; + } + dev_notice(&udev->pdev->dev, "PCI INTX mask not supported\n"); + /* falls through - to no IRQ */ + case RTE_INTR_MODE_NONE: + udev->mode = RTE_INTR_MODE_NONE; + udev->info.irq = UIO_IRQ_NONE; + break; + + default: + dev_err(&udev->pdev->dev, "invalid IRQ mode %u", + igbuio_intr_mode_preferred); + udev->info.irq = UIO_IRQ_NONE; + err = -EINVAL; + } + + if (udev->info.irq != UIO_IRQ_NONE) + err = request_irq(udev->info.irq, igbuio_pci_irqhandler, + udev->info.irq_flags, udev->info.name, + udev); + dev_info(&udev->pdev->dev, "uio device registered with irq %ld\n", + udev->info.irq); + + return err; +} + +static void +igbuio_pci_disable_interrupts(struct rte_uio_pci_dev *udev) +{ + if (udev->info.irq) { + free_irq(udev->info.irq, udev); + udev->info.irq = 0; + } + +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (udev->mode == RTE_INTR_MODE_MSIX) + pci_disable_msix(udev->pdev); + if (udev->mode == RTE_INTR_MODE_MSI) + pci_disable_msi(udev->pdev); +#else + if (udev->mode == RTE_INTR_MODE_MSIX || + udev->mode == RTE_INTR_MODE_MSI) + pci_free_irq_vectors(udev->pdev); +#endif +} + + +/** + * This gets called while opening uio device file. + */ +static int +igbuio_pci_open(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + int err; + + if (atomic_inc_return(&udev->refcnt) != 1) + return 0; + + /* set bus master, which was cleared by the reset function */ + pci_set_master(dev); + + /* enable interrupts */ + err = igbuio_pci_enable_interrupts(udev); + if (err) { + atomic_dec(&udev->refcnt); + dev_err(&dev->dev, "Enable interrupt fails\n"); + } + return err; +} + +static int +igbuio_pci_release(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + + if (atomic_dec_and_test(&udev->refcnt)) { + /* disable interrupts */ + igbuio_pci_disable_interrupts(udev); + + /* stop the device from further DMA */ + pci_clear_master(dev); + } + + return 0; +} + +/* Remap pci resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_iomem(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + void *internal_addr; + + if (n >= ARRAY_SIZE(info->mem)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -1; + if (wc_activate == 0) { + internal_addr = ioremap(addr, len); + if (internal_addr == NULL) + return -1; + } else { + internal_addr = NULL; + } + info->mem[n].name = name; + info->mem[n].addr = addr; + info->mem[n].internal_addr = internal_addr; + info->mem[n].size = len; + info->mem[n].memtype = UIO_MEM_PHYS; + return 0; +} + +/* Get pci port io resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_ioport(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + + if (n >= ARRAY_SIZE(info->port)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -EINVAL; + + info->port[n].name = name; + info->port[n].start = addr; + info->port[n].size = len; + info->port[n].porttype = UIO_PORT_X86; + + return 0; +} + +/* Unmap previously ioremap'd resources */ +static void +igbuio_pci_release_iomem(struct uio_info *info) +{ + int i; + + for (i = 0; i < MAX_UIO_MAPS; i++) { + if (info->mem[i].internal_addr) + iounmap(info->mem[i].internal_addr); + } +} + +static int +igbuio_setup_bars(struct pci_dev *dev, struct uio_info *info) +{ + int i, iom, iop, ret; + unsigned long flags; + static const char *bar_names[PCI_STD_RESOURCE_END + 1] = { + "BAR0", + "BAR1", + "BAR2", + "BAR3", + "BAR4", + "BAR5", + }; + + iom = 0; + iop = 0; + + for (i = 0; i < ARRAY_SIZE(bar_names); i++) { + if (pci_resource_len(dev, i) != 0 && + pci_resource_start(dev, i) != 0) { + flags = pci_resource_flags(dev, i); + if (flags & IORESOURCE_MEM) { + ret = igbuio_pci_setup_iomem(dev, info, iom, + i, bar_names[i]); + if (ret != 0) + return ret; + iom++; + } else if (flags & IORESOURCE_IO) { + ret = igbuio_pci_setup_ioport(dev, info, iop, + i, bar_names[i]); + if (ret != 0) + return ret; + iop++; + } + } + } + + return (iom != 0 || iop != 0) ? ret : -ENOENT; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0) +static int __devinit +#else +static int +#endif +igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct rte_uio_pci_dev *udev; + dma_addr_t map_dma_addr; + void *map_addr; + int err; + +#ifdef HAVE_PCI_IS_BRIDGE_API + if (pci_is_bridge(dev)) { + dev_warn(&dev->dev, "Ignoring PCI bridge device\n"); + return -ENODEV; + } +#endif + + udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL); + if (!udev) + return -ENOMEM; + + /* + * enable device: ask low-level code to enable I/O and + * memory + */ + err = pci_enable_device(dev); + if (err != 0) { + dev_err(&dev->dev, "Cannot enable PCI device\n"); + goto fail_free; + } + + /* enable bus mastering on the device */ + pci_set_master(dev); + + /* remap IO memory */ + err = igbuio_setup_bars(dev, &udev->info); + if (err != 0) + goto fail_release_iomem; + + /* set 64-bit DMA mask */ + err = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set DMA mask\n"); + goto fail_release_iomem; + } + + err = pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set consistent DMA mask\n"); + goto fail_release_iomem; + } + + /* fill uio infos */ + udev->info.name = "igb_uio"; + udev->info.version = "0.1"; + udev->info.irqcontrol = igbuio_pci_irqcontrol; + udev->info.open = igbuio_pci_open; + udev->info.release = igbuio_pci_release; + udev->info.priv = udev; + udev->pdev = dev; + atomic_set(&udev->refcnt, 0); + + err = sysfs_create_group(&dev->dev.kobj, &dev_attr_grp); + if (err != 0) + goto fail_release_iomem; + + /* register uio driver */ + err = uio_register_device(&dev->dev, &udev->info); + if (err != 0) + goto fail_remove_group; + + pci_set_drvdata(dev, udev); + + /* + * Doing a harmless dma mapping for attaching the device to + * the iommu identity mapping if kernel boots with iommu=pt. + * Note this is not a problem if no IOMMU at all. + */ + map_addr = dma_alloc_coherent(&dev->dev, 1024, &map_dma_addr, + GFP_KERNEL); + if (map_addr) + memset(map_addr, 0, 1024); + + if (!map_addr) + dev_info(&dev->dev, "dma mapping failed\n"); + else { + dev_info(&dev->dev, "mapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + + dma_free_coherent(&dev->dev, 1024, map_addr, map_dma_addr); + dev_info(&dev->dev, "unmapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + } + + return 0; + +fail_remove_group: + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); +fail_release_iomem: + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); +fail_free: + kfree(udev); + + return err; +} + +static void +igbuio_pci_remove(struct pci_dev *dev) +{ + struct rte_uio_pci_dev *udev = pci_get_drvdata(dev); + + igbuio_pci_release(&udev->info, NULL); + + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); + uio_unregister_device(&udev->info); + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); + pci_set_drvdata(dev, NULL); + kfree(udev); +} + +static int +igbuio_config_intr_mode(char *intr_str) +{ + if (!intr_str) { + pr_info("Use MSIX interrupt by default\n"); + return 0; + } + + if (!strcmp(intr_str, RTE_INTR_MODE_MSIX_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; + pr_info("Use MSIX interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_MSI_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSI; + pr_info("Use MSI interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_LEGACY_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_LEGACY; + pr_info("Use legacy interrupt\n"); + } else { + pr_info("Error: bad parameter - %s\n", intr_str); + return -EINVAL; + } + + return 0; +} + +static struct pci_driver igbuio_pci_driver = { + .name = "igb_uio", + .id_table = NULL, + .probe = igbuio_pci_probe, + .remove = igbuio_pci_remove, +}; + +static int __init +igbuio_pci_init_module(void) +{ + int ret; + + if (igbuio_kernel_is_locked_down()) { + pr_err("Not able to use module, kernel lock down is enabled\n"); + return -EINVAL; + } + + if (wc_activate != 0) + pr_info("wc_activate is set\n"); + + ret = igbuio_config_intr_mode(intr_mode); + if (ret < 0) + return ret; + + return pci_register_driver(&igbuio_pci_driver); +} + +static void __exit +igbuio_pci_exit_module(void) +{ + pci_unregister_driver(&igbuio_pci_driver); +} + +module_init(igbuio_pci_init_module); +module_exit(igbuio_pci_exit_module); + +module_param(intr_mode, charp, S_IRUGO); +MODULE_PARM_DESC(intr_mode, +"igb_uio interrupt mode (default=msix):\n" +" " RTE_INTR_MODE_MSIX_NAME " Use MSIX interrupt\n" +" " RTE_INTR_MODE_MSI_NAME " Use MSI interrupt\n" +" " RTE_INTR_MODE_LEGACY_NAME " Use Legacy interrupt\n" +"\n"); + +module_param(wc_activate, int, 0); +MODULE_PARM_DESC(wc_activate, +"Activate support for write combining (WC) (default=0)\n" +" 0 - disable\n" +" other - enable\n"); + +MODULE_DESCRIPTION("UIO driver for Intel IGB PCI cards"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/src/spdk/dpdk/kernel/linux/igb_uio/meson.build b/src/spdk/dpdk/kernel/linux/igb_uio/meson.build new file mode 100644 index 000000000..80540aece --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/igb_uio/meson.build @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +mkfile = custom_target('igb_uio_makefile', + output: 'Makefile', + command: ['touch', '@OUTPUT@']) + +custom_target('igb_uio', + input: ['igb_uio.c', 'Kbuild'], + output: 'igb_uio.ko', + command: ['make', '-C', kernel_dir + '/build', + 'M=' + meson.current_build_dir(), + 'src=' + meson.current_source_dir(), + 'EXTRA_CFLAGS=-I' + meson.current_source_dir() + + '/../../../lib/librte_eal/include', + 'modules'], + depends: mkfile, + install: true, + install_dir: kernel_dir + '/extra/dpdk', + build_by_default: get_option('enable_kmods')) diff --git a/src/spdk/dpdk/kernel/linux/kni/Kbuild b/src/spdk/dpdk/kernel/linux/kni/Kbuild new file mode 100644 index 000000000..e5452d6c0 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/Kbuild @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +ccflags-y := $(MODULE_CFLAGS) +obj-m := rte_kni.o +rte_kni-y := $(patsubst $(src)/%.c,%.o,$(wildcard $(src)/*.c)) diff --git a/src/spdk/dpdk/kernel/linux/kni/Makefile b/src/spdk/dpdk/kernel/linux/kni/Makefile new file mode 100644 index 000000000..595bac261 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/Makefile @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# module name and path +# +MODULE = rte_kni + +# +# CFLAGS +# +MODULE_CFLAGS += -I$(SRCDIR) --param max-inline-insns-single=50 +MODULE_CFLAGS += -I$(RTE_OUTPUT)/include +MODULE_CFLAGS += -include $(RTE_OUTPUT)/include/rte_config.h +MODULE_CFLAGS += -Wall -Werror + +-include /etc/lsb-release + +ifeq ($(DISTRIB_ID),Ubuntu) +MODULE_CFLAGS += -DUBUNTU_RELEASE_CODE=$(subst .,,$(DISTRIB_RELEASE)) +UBUNTU_KERNEL_CODE := $(shell echo `grep UTS_RELEASE $(RTE_KERNELDIR)/include/generated/utsrelease.h \ + | cut -d '"' -f2 | cut -d- -f1,2 | tr .- ,`,1) +MODULE_CFLAGS += -D"UBUNTU_KERNEL_CODE=UBUNTU_KERNEL_VERSION($(UBUNTU_KERNEL_CODE))" +endif + +# +# all source are stored in SRCS-y +# +SRCS-y := kni_misc.c +SRCS-y += kni_net.c + +include $(RTE_SDK)/mk/rte.module.mk diff --git a/src/spdk/dpdk/kernel/linux/kni/compat.h b/src/spdk/dpdk/kernel/linux/kni/compat.h new file mode 100644 index 000000000..9ee45dbf6 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/compat.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Minimal wrappers to allow compiling kni on older kernels. + */ + +#include <linux/version.h> + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a, b) (((a) << 8) + (b)) +#endif + +/* SuSE version macro is the same as Linux kernel version */ +#ifndef SLE_VERSION +#define SLE_VERSION(a, b, c) KERNEL_VERSION(a, b, c) +#endif +#ifdef CONFIG_SUSE_KERNEL +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 57)) +/* SLES12SP3 is at least 4.4.57+ based */ +#define SLE_VERSION_CODE SLE_VERSION(12, 3, 0) +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 28)) +/* SLES12 is at least 3.12.28+ based */ +#define SLE_VERSION_CODE SLE_VERSION(12, 0, 0) +#elif ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 61)) && \ + (LINUX_VERSION_CODE < KERNEL_VERSION(3, 1, 0))) +/* SLES11 SP3 is at least 3.0.61+ based */ +#define SLE_VERSION_CODE SLE_VERSION(11, 3, 0) +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(2, 6, 32)) +/* SLES11 SP1 is 2.6.32 based */ +#define SLE_VERSION_CODE SLE_VERSION(11, 1, 0) +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(2, 6, 27)) +/* SLES11 GA is 2.6.27 based */ +#define SLE_VERSION_CODE SLE_VERSION(11, 0, 0) +#endif /* LINUX_VERSION_CODE == KERNEL_VERSION(x,y,z) */ +#endif /* CONFIG_SUSE_KERNEL */ +#ifndef SLE_VERSION_CODE +#define SLE_VERSION_CODE 0 +#endif /* SLE_VERSION_CODE */ + + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 39) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) + +#define kstrtoul strict_strtoul + +#endif /* < 2.6.39 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 33) +#define HAVE_SIMPLIFIED_PERNET_OPERATIONS +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35) +#define sk_sleep(s) ((s)->sk_sleep) +#else +#define HAVE_SOCKET_WQ +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) +#define HAVE_STATIC_SOCK_MAP_FD +#else +#define kni_sock_map_fd(s) sock_map_fd(s, 0) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0) +#define HAVE_CHANGE_CARRIER_CB +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0) +#define ether_addr_copy(dst, src) memcpy(dst, src, ETH_ALEN) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +#define HAVE_IOV_ITER_MSGHDR +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) +#define HAVE_KIOCB_MSG_PARAM +#define HAVE_REBUILD_HEADER +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) +#define HAVE_SK_ALLOC_KERN_PARAM +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) || \ + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 4)) || \ + (SLE_VERSION_CODE && SLE_VERSION_CODE == SLE_VERSION(12, 3, 0)) +#define HAVE_TRANS_START_HELPER +#endif + +/* + * KNI uses NET_NAME_UNKNOWN macro to select correct version of alloc_netdev() + * For old kernels just backported the commit that enables the macro + * (685343fc3ba6) but still uses old API, it is required to undefine macro to + * select correct version of API, this is safe since KNI doesn't use the value. + * This fix is specific to RedHat/CentOS kernels. + */ +#if (defined(RHEL_RELEASE_CODE) && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 8)) && \ + (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34))) +#undef NET_NAME_UNKNOWN +#endif + +/* + * RHEL has two different version with different kernel version: + * 3.10 is for AMD, Intel, IBM POWER7 and POWER8; + * 4.14 is for ARM and IBM POWER9 + */ +#if (defined(RHEL_RELEASE_CODE) && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8, 0)) && \ + (LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0))) +#define ndo_change_mtu ndo_change_mtu_rh74 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#define HAVE_MAX_MTU_PARAM +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) +#define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER +#endif + +/* + * iova to kva mapping support can be provided since 4.6.0, but required + * kernel version increased to >= 4.10.0 because of the updates in + * get_user_pages_remote() kernel API + */ +#if KERNEL_VERSION(4, 10, 0) <= LINUX_VERSION_CODE +#define HAVE_IOVA_TO_KVA_MAPPING_SUPPORT +#endif + +#if KERNEL_VERSION(5, 6, 0) <= LINUX_VERSION_CODE +#define HAVE_TX_TIMEOUT_TXQUEUE +#endif diff --git a/src/spdk/dpdk/kernel/linux/kni/kni_dev.h b/src/spdk/dpdk/kernel/linux/kni/kni_dev.h new file mode 100644 index 000000000..ca5f92a47 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/kni_dev.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2010-2014 Intel Corporation. + */ + +#ifndef _KNI_DEV_H_ +#define _KNI_DEV_H_ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define KNI_VERSION "1.0" + +#include "compat.h" + +#include <linux/if.h> +#include <linux/wait.h> +#ifdef HAVE_SIGNAL_FUNCTIONS_OWN_HEADER +#include <linux/sched/signal.h> +#else +#include <linux/sched.h> +#endif +#include <linux/netdevice.h> +#include <linux/spinlock.h> +#include <linux/list.h> + +#include <rte_kni_common.h> +#define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */ + +#define MBUF_BURST_SZ 32 + +/* Default carrier state for created KNI network interfaces */ +extern uint32_t kni_dflt_carrier; + +/** + * A structure describing the private information for a kni device. + */ +struct kni_dev { + /* kni list */ + struct list_head list; + + uint8_t iova_mode; + + uint32_t core_id; /* Core ID to bind */ + char name[RTE_KNI_NAMESIZE]; /* Network device name */ + struct task_struct *pthread; + + /* wait queue for req/resp */ + wait_queue_head_t wq; + struct mutex sync_lock; + + /* kni device */ + struct net_device *net_dev; + + /* queue for packets to be sent out */ + struct rte_kni_fifo *tx_q; + + /* queue for the packets received */ + struct rte_kni_fifo *rx_q; + + /* queue for the allocated mbufs those can be used to save sk buffs */ + struct rte_kni_fifo *alloc_q; + + /* free queue for the mbufs to be freed */ + struct rte_kni_fifo *free_q; + + /* request queue */ + struct rte_kni_fifo *req_q; + + /* response queue */ + struct rte_kni_fifo *resp_q; + + void *sync_kva; + void *sync_va; + + void *mbuf_kva; + void *mbuf_va; + + /* mbuf size */ + uint32_t mbuf_size; + + /* buffers */ + void *pa[MBUF_BURST_SZ]; + void *va[MBUF_BURST_SZ]; + void *alloc_pa[MBUF_BURST_SZ]; + void *alloc_va[MBUF_BURST_SZ]; + + struct task_struct *usr_tsk; +}; + +#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT +static inline phys_addr_t iova_to_phys(struct task_struct *tsk, + unsigned long iova) +{ + phys_addr_t offset, phys_addr; + struct page *page = NULL; + long ret; + + offset = iova & (PAGE_SIZE - 1); + + /* Read one page struct info */ + ret = get_user_pages_remote(tsk, tsk->mm, iova, 1, + FOLL_TOUCH, &page, NULL, NULL); + if (ret < 0) + return 0; + + phys_addr = page_to_phys(page) | offset; + put_page(page); + + return phys_addr; +} + +static inline void *iova_to_kva(struct task_struct *tsk, unsigned long iova) +{ + return phys_to_virt(iova_to_phys(tsk, iova)); +} +#endif + +void kni_net_release_fifo_phy(struct kni_dev *kni); +void kni_net_rx(struct kni_dev *kni); +void kni_net_init(struct net_device *dev); +void kni_net_config_lo_mode(char *lo_str); +void kni_net_poll_resp(struct kni_dev *kni); + +#endif diff --git a/src/spdk/dpdk/kernel/linux/kni/kni_fifo.h b/src/spdk/dpdk/kernel/linux/kni/kni_fifo.h new file mode 100644 index 000000000..5c91b5537 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/kni_fifo.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2010-2014 Intel Corporation. + */ + +#ifndef _KNI_FIFO_H_ +#define _KNI_FIFO_H_ + +#include <rte_kni_common.h> + +/* Skip some memory barriers on Linux < 3.14 */ +#ifndef smp_load_acquire +#define smp_load_acquire(a) (*(a)) +#endif +#ifndef smp_store_release +#define smp_store_release(a, b) *(a) = (b) +#endif + +/** + * Adds num elements into the fifo. Return the number actually written + */ +static inline uint32_t +kni_fifo_put(struct rte_kni_fifo *fifo, void **data, uint32_t num) +{ + uint32_t i = 0; + uint32_t fifo_write = fifo->write; + uint32_t fifo_read = smp_load_acquire(&fifo->read); + uint32_t new_write = fifo_write; + + for (i = 0; i < num; i++) { + new_write = (new_write + 1) & (fifo->len - 1); + + if (new_write == fifo_read) + break; + fifo->buffer[fifo_write] = data[i]; + fifo_write = new_write; + } + smp_store_release(&fifo->write, fifo_write); + + return i; +} + +/** + * Get up to num elements from the fifo. Return the number actully read + */ +static inline uint32_t +kni_fifo_get(struct rte_kni_fifo *fifo, void **data, uint32_t num) +{ + uint32_t i = 0; + uint32_t new_read = fifo->read; + uint32_t fifo_write = smp_load_acquire(&fifo->write); + + for (i = 0; i < num; i++) { + if (new_read == fifo_write) + break; + + data[i] = fifo->buffer[new_read]; + new_read = (new_read + 1) & (fifo->len - 1); + } + smp_store_release(&fifo->read, new_read); + + return i; +} + +/** + * Get the num of elements in the fifo + */ +static inline uint32_t +kni_fifo_count(struct rte_kni_fifo *fifo) +{ + uint32_t fifo_write = smp_load_acquire(&fifo->write); + uint32_t fifo_read = smp_load_acquire(&fifo->read); + return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1); +} + +/** + * Get the num of available elements in the fifo + */ +static inline uint32_t +kni_fifo_free_count(struct rte_kni_fifo *fifo) +{ + uint32_t fifo_write = smp_load_acquire(&fifo->write); + uint32_t fifo_read = smp_load_acquire(&fifo->read); + return (fifo_read - fifo_write - 1) & (fifo->len - 1); +} + +#endif /* _KNI_FIFO_H_ */ diff --git a/src/spdk/dpdk/kernel/linux/kni/kni_misc.c b/src/spdk/dpdk/kernel/linux/kni/kni_misc.c new file mode 100644 index 000000000..2b464c438 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/kni_misc.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(c) 2010-2014 Intel Corporation. + */ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/pci.h> +#include <linux/kthread.h> +#include <linux/rwsem.h> +#include <linux/mutex.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <rte_kni_common.h> + +#include "compat.h" +#include "kni_dev.h" + +MODULE_VERSION(KNI_VERSION); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Kernel Module for managing kni devices"); + +#define KNI_RX_LOOP_NUM 1000 + +#define KNI_MAX_DEVICES 32 + +/* loopback mode */ +static char *lo_mode; + +/* Kernel thread mode */ +static char *kthread_mode; +static uint32_t multiple_kthread_on; + +/* Default carrier state for created KNI network interfaces */ +static char *carrier; +uint32_t kni_dflt_carrier; + +#define KNI_DEV_IN_USE_BIT_NUM 0 /* Bit number for device in use */ + +static int kni_net_id; + +struct kni_net { + unsigned long device_in_use; /* device in use flag */ + struct mutex kni_kthread_lock; + struct task_struct *kni_kthread; + struct rw_semaphore kni_list_lock; + struct list_head kni_list_head; +}; + +static int __net_init +kni_init_net(struct net *net) +{ +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS + struct kni_net *knet = net_generic(net, kni_net_id); + + memset(knet, 0, sizeof(*knet)); +#else + struct kni_net *knet; + int ret; + + knet = kzalloc(sizeof(struct kni_net), GFP_KERNEL); + if (!knet) { + ret = -ENOMEM; + return ret; + } +#endif + + /* Clear the bit of device in use */ + clear_bit(KNI_DEV_IN_USE_BIT_NUM, &knet->device_in_use); + + mutex_init(&knet->kni_kthread_lock); + + init_rwsem(&knet->kni_list_lock); + INIT_LIST_HEAD(&knet->kni_list_head); + +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS + return 0; +#else + ret = net_assign_generic(net, kni_net_id, knet); + if (ret < 0) + kfree(knet); + + return ret; +#endif +} + +static void __net_exit +kni_exit_net(struct net *net) +{ + struct kni_net *knet __maybe_unused; + + knet = net_generic(net, kni_net_id); + mutex_destroy(&knet->kni_kthread_lock); + +#ifndef HAVE_SIMPLIFIED_PERNET_OPERATIONS + kfree(knet); +#endif +} + +static struct pernet_operations kni_net_ops = { + .init = kni_init_net, + .exit = kni_exit_net, +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS + .id = &kni_net_id, + .size = sizeof(struct kni_net), +#endif +}; + +static int +kni_thread_single(void *data) +{ + struct kni_net *knet = data; + int j; + struct kni_dev *dev; + + while (!kthread_should_stop()) { + down_read(&knet->kni_list_lock); + for (j = 0; j < KNI_RX_LOOP_NUM; j++) { + list_for_each_entry(dev, &knet->kni_list_head, list) { + kni_net_rx(dev); + kni_net_poll_resp(dev); + } + } + up_read(&knet->kni_list_lock); +#ifdef RTE_KNI_PREEMPT_DEFAULT + /* reschedule out for a while */ + schedule_timeout_interruptible( + usecs_to_jiffies(KNI_KTHREAD_RESCHEDULE_INTERVAL)); +#endif + } + + return 0; +} + +static int +kni_thread_multiple(void *param) +{ + int j; + struct kni_dev *dev = param; + + while (!kthread_should_stop()) { + for (j = 0; j < KNI_RX_LOOP_NUM; j++) { + kni_net_rx(dev); + kni_net_poll_resp(dev); + } +#ifdef RTE_KNI_PREEMPT_DEFAULT + schedule_timeout_interruptible( + usecs_to_jiffies(KNI_KTHREAD_RESCHEDULE_INTERVAL)); +#endif + } + + return 0; +} + +static int +kni_open(struct inode *inode, struct file *file) +{ + struct net *net = current->nsproxy->net_ns; + struct kni_net *knet = net_generic(net, kni_net_id); + + /* kni device can be opened by one user only per netns */ + if (test_and_set_bit(KNI_DEV_IN_USE_BIT_NUM, &knet->device_in_use)) + return -EBUSY; + + file->private_data = get_net(net); + pr_debug("/dev/kni opened\n"); + + return 0; +} + +static int +kni_dev_remove(struct kni_dev *dev) +{ + if (!dev) + return -ENODEV; + + if (dev->net_dev) { + unregister_netdev(dev->net_dev); + free_netdev(dev->net_dev); + } + + kni_net_release_fifo_phy(dev); + + return 0; +} + +static int +kni_release(struct inode *inode, struct file *file) +{ + struct net *net = file->private_data; + struct kni_net *knet = net_generic(net, kni_net_id); + struct kni_dev *dev, *n; + + /* Stop kernel thread for single mode */ + if (multiple_kthread_on == 0) { + mutex_lock(&knet->kni_kthread_lock); + /* Stop kernel thread */ + if (knet->kni_kthread != NULL) { + kthread_stop(knet->kni_kthread); + knet->kni_kthread = NULL; + } + mutex_unlock(&knet->kni_kthread_lock); + } + + down_write(&knet->kni_list_lock); + list_for_each_entry_safe(dev, n, &knet->kni_list_head, list) { + /* Stop kernel thread for multiple mode */ + if (multiple_kthread_on && dev->pthread != NULL) { + kthread_stop(dev->pthread); + dev->pthread = NULL; + } + + kni_dev_remove(dev); + list_del(&dev->list); + } + up_write(&knet->kni_list_lock); + + /* Clear the bit of device in use */ + clear_bit(KNI_DEV_IN_USE_BIT_NUM, &knet->device_in_use); + + put_net(net); + pr_debug("/dev/kni closed\n"); + + return 0; +} + +static int +kni_check_param(struct kni_dev *kni, struct rte_kni_device_info *dev) +{ + if (!kni || !dev) + return -1; + + /* Check if network name has been used */ + if (!strncmp(kni->name, dev->name, RTE_KNI_NAMESIZE)) { + pr_err("KNI name %s duplicated\n", dev->name); + return -1; + } + + return 0; +} + +static int +kni_run_thread(struct kni_net *knet, struct kni_dev *kni, uint8_t force_bind) +{ + /** + * Create a new kernel thread for multiple mode, set its core affinity, + * and finally wake it up. + */ + if (multiple_kthread_on) { + kni->pthread = kthread_create(kni_thread_multiple, + (void *)kni, "kni_%s", kni->name); + if (IS_ERR(kni->pthread)) { + kni_dev_remove(kni); + return -ECANCELED; + } + + if (force_bind) + kthread_bind(kni->pthread, kni->core_id); + wake_up_process(kni->pthread); + } else { + mutex_lock(&knet->kni_kthread_lock); + + if (knet->kni_kthread == NULL) { + knet->kni_kthread = kthread_create(kni_thread_single, + (void *)knet, "kni_single"); + if (IS_ERR(knet->kni_kthread)) { + mutex_unlock(&knet->kni_kthread_lock); + kni_dev_remove(kni); + return -ECANCELED; + } + + if (force_bind) + kthread_bind(knet->kni_kthread, kni->core_id); + wake_up_process(knet->kni_kthread); + } + + mutex_unlock(&knet->kni_kthread_lock); + } + + return 0; +} + +static int +kni_ioctl_create(struct net *net, uint32_t ioctl_num, + unsigned long ioctl_param) +{ + struct kni_net *knet = net_generic(net, kni_net_id); + int ret; + struct rte_kni_device_info dev_info; + struct net_device *net_dev = NULL; + struct kni_dev *kni, *dev, *n; + + pr_info("Creating kni...\n"); + /* Check the buffer size, to avoid warning */ + if (_IOC_SIZE(ioctl_num) > sizeof(dev_info)) + return -EINVAL; + + /* Copy kni info from user space */ + if (copy_from_user(&dev_info, (void *)ioctl_param, sizeof(dev_info))) + return -EFAULT; + + /* Check if name is zero-ended */ + if (strnlen(dev_info.name, sizeof(dev_info.name)) == sizeof(dev_info.name)) { + pr_err("kni.name not zero-terminated"); + return -EINVAL; + } + + /** + * Check if the cpu core id is valid for binding. + */ + if (dev_info.force_bind && !cpu_online(dev_info.core_id)) { + pr_err("cpu %u is not online\n", dev_info.core_id); + return -EINVAL; + } + + /* Check if it has been created */ + down_read(&knet->kni_list_lock); + list_for_each_entry_safe(dev, n, &knet->kni_list_head, list) { + if (kni_check_param(dev, &dev_info) < 0) { + up_read(&knet->kni_list_lock); + return -EINVAL; + } + } + up_read(&knet->kni_list_lock); + + net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name, +#ifdef NET_NAME_USER + NET_NAME_USER, +#endif + kni_net_init); + if (net_dev == NULL) { + pr_err("error allocating device \"%s\"\n", dev_info.name); + return -EBUSY; + } + + dev_net_set(net_dev, net); + + kni = netdev_priv(net_dev); + + kni->net_dev = net_dev; + kni->core_id = dev_info.core_id; + strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE); + + /* Translate user space info into kernel space info */ + if (dev_info.iova_mode) { +#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT + kni->tx_q = iova_to_kva(current, dev_info.tx_phys); + kni->rx_q = iova_to_kva(current, dev_info.rx_phys); + kni->alloc_q = iova_to_kva(current, dev_info.alloc_phys); + kni->free_q = iova_to_kva(current, dev_info.free_phys); + + kni->req_q = iova_to_kva(current, dev_info.req_phys); + kni->resp_q = iova_to_kva(current, dev_info.resp_phys); + kni->sync_va = dev_info.sync_va; + kni->sync_kva = iova_to_kva(current, dev_info.sync_phys); + kni->usr_tsk = current; + kni->iova_mode = 1; +#else + pr_err("KNI module does not support IOVA to VA translation\n"); + return -EINVAL; +#endif + } else { + + kni->tx_q = phys_to_virt(dev_info.tx_phys); + kni->rx_q = phys_to_virt(dev_info.rx_phys); + kni->alloc_q = phys_to_virt(dev_info.alloc_phys); + kni->free_q = phys_to_virt(dev_info.free_phys); + + kni->req_q = phys_to_virt(dev_info.req_phys); + kni->resp_q = phys_to_virt(dev_info.resp_phys); + kni->sync_va = dev_info.sync_va; + kni->sync_kva = phys_to_virt(dev_info.sync_phys); + kni->iova_mode = 0; + } + + kni->mbuf_size = dev_info.mbuf_size; + + pr_debug("tx_phys: 0x%016llx, tx_q addr: 0x%p\n", + (unsigned long long) dev_info.tx_phys, kni->tx_q); + pr_debug("rx_phys: 0x%016llx, rx_q addr: 0x%p\n", + (unsigned long long) dev_info.rx_phys, kni->rx_q); + pr_debug("alloc_phys: 0x%016llx, alloc_q addr: 0x%p\n", + (unsigned long long) dev_info.alloc_phys, kni->alloc_q); + pr_debug("free_phys: 0x%016llx, free_q addr: 0x%p\n", + (unsigned long long) dev_info.free_phys, kni->free_q); + pr_debug("req_phys: 0x%016llx, req_q addr: 0x%p\n", + (unsigned long long) dev_info.req_phys, kni->req_q); + pr_debug("resp_phys: 0x%016llx, resp_q addr: 0x%p\n", + (unsigned long long) dev_info.resp_phys, kni->resp_q); + pr_debug("mbuf_size: %u\n", kni->mbuf_size); + + /* if user has provided a valid mac address */ + if (is_valid_ether_addr(dev_info.mac_addr)) + memcpy(net_dev->dev_addr, dev_info.mac_addr, ETH_ALEN); + else + /* + * Generate random mac address. eth_random_addr() is the + * newer version of generating mac address in kernel. + */ + random_ether_addr(net_dev->dev_addr); + + if (dev_info.mtu) + net_dev->mtu = dev_info.mtu; +#ifdef HAVE_MAX_MTU_PARAM + net_dev->max_mtu = net_dev->mtu; + + if (dev_info.min_mtu) + net_dev->min_mtu = dev_info.min_mtu; + + if (dev_info.max_mtu) + net_dev->max_mtu = dev_info.max_mtu; +#endif + + ret = register_netdev(net_dev); + if (ret) { + pr_err("error %i registering device \"%s\"\n", + ret, dev_info.name); + kni->net_dev = NULL; + kni_dev_remove(kni); + free_netdev(net_dev); + return -ENODEV; + } + + netif_carrier_off(net_dev); + + ret = kni_run_thread(knet, kni, dev_info.force_bind); + if (ret != 0) + return ret; + + down_write(&knet->kni_list_lock); + list_add(&kni->list, &knet->kni_list_head); + up_write(&knet->kni_list_lock); + + return 0; +} + +static int +kni_ioctl_release(struct net *net, uint32_t ioctl_num, + unsigned long ioctl_param) +{ + struct kni_net *knet = net_generic(net, kni_net_id); + int ret = -EINVAL; + struct kni_dev *dev, *n; + struct rte_kni_device_info dev_info; + + if (_IOC_SIZE(ioctl_num) > sizeof(dev_info)) + return -EINVAL; + + if (copy_from_user(&dev_info, (void *)ioctl_param, sizeof(dev_info))) + return -EFAULT; + + /* Release the network device according to its name */ + if (strlen(dev_info.name) == 0) + return -EINVAL; + + down_write(&knet->kni_list_lock); + list_for_each_entry_safe(dev, n, &knet->kni_list_head, list) { + if (strncmp(dev->name, dev_info.name, RTE_KNI_NAMESIZE) != 0) + continue; + + if (multiple_kthread_on && dev->pthread != NULL) { + kthread_stop(dev->pthread); + dev->pthread = NULL; + } + + kni_dev_remove(dev); + list_del(&dev->list); + ret = 0; + break; + } + up_write(&knet->kni_list_lock); + pr_info("%s release kni named %s\n", + (ret == 0 ? "Successfully" : "Unsuccessfully"), dev_info.name); + + return ret; +} + +static int +kni_ioctl(struct inode *inode, uint32_t ioctl_num, unsigned long ioctl_param) +{ + int ret = -EINVAL; + struct net *net = current->nsproxy->net_ns; + + pr_debug("IOCTL num=0x%0x param=0x%0lx\n", ioctl_num, ioctl_param); + + /* + * Switch according to the ioctl called + */ + switch (_IOC_NR(ioctl_num)) { + case _IOC_NR(RTE_KNI_IOCTL_TEST): + /* For test only, not used */ + break; + case _IOC_NR(RTE_KNI_IOCTL_CREATE): + ret = kni_ioctl_create(net, ioctl_num, ioctl_param); + break; + case _IOC_NR(RTE_KNI_IOCTL_RELEASE): + ret = kni_ioctl_release(net, ioctl_num, ioctl_param); + break; + default: + pr_debug("IOCTL default\n"); + break; + } + + return ret; +} + +static int +kni_compat_ioctl(struct inode *inode, uint32_t ioctl_num, + unsigned long ioctl_param) +{ + /* 32 bits app on 64 bits OS to be supported later */ + pr_debug("Not implemented.\n"); + + return -EINVAL; +} + +static const struct file_operations kni_fops = { + .owner = THIS_MODULE, + .open = kni_open, + .release = kni_release, + .unlocked_ioctl = (void *)kni_ioctl, + .compat_ioctl = (void *)kni_compat_ioctl, +}; + +static struct miscdevice kni_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = KNI_DEVICE, + .fops = &kni_fops, +}; + +static int __init +kni_parse_kthread_mode(void) +{ + if (!kthread_mode) + return 0; + + if (strcmp(kthread_mode, "single") == 0) + return 0; + else if (strcmp(kthread_mode, "multiple") == 0) + multiple_kthread_on = 1; + else + return -1; + + return 0; +} + +static int __init +kni_parse_carrier_state(void) +{ + if (!carrier) { + kni_dflt_carrier = 0; + return 0; + } + + if (strcmp(carrier, "off") == 0) + kni_dflt_carrier = 0; + else if (strcmp(carrier, "on") == 0) + kni_dflt_carrier = 1; + else + return -1; + + return 0; +} + +static int __init +kni_init(void) +{ + int rc; + + if (kni_parse_kthread_mode() < 0) { + pr_err("Invalid parameter for kthread_mode\n"); + return -EINVAL; + } + + if (multiple_kthread_on == 0) + pr_debug("Single kernel thread for all KNI devices\n"); + else + pr_debug("Multiple kernel thread mode enabled\n"); + + if (kni_parse_carrier_state() < 0) { + pr_err("Invalid parameter for carrier\n"); + return -EINVAL; + } + + if (kni_dflt_carrier == 0) + pr_debug("Default carrier state set to off.\n"); + else + pr_debug("Default carrier state set to on.\n"); + +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS + rc = register_pernet_subsys(&kni_net_ops); +#else + rc = register_pernet_gen_subsys(&kni_net_id, &kni_net_ops); +#endif + if (rc) + return -EPERM; + + rc = misc_register(&kni_misc); + if (rc != 0) { + pr_err("Misc registration failed\n"); + goto out; + } + + /* Configure the lo mode according to the input parameter */ + kni_net_config_lo_mode(lo_mode); + + return 0; + +out: +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS + unregister_pernet_subsys(&kni_net_ops); +#else + unregister_pernet_gen_subsys(kni_net_id, &kni_net_ops); +#endif + return rc; +} + +static void __exit +kni_exit(void) +{ + misc_deregister(&kni_misc); +#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS + unregister_pernet_subsys(&kni_net_ops); +#else + unregister_pernet_gen_subsys(kni_net_id, &kni_net_ops); +#endif +} + +module_init(kni_init); +module_exit(kni_exit); + +module_param(lo_mode, charp, 0644); +MODULE_PARM_DESC(lo_mode, +"KNI loopback mode (default=lo_mode_none):\n" +"\t\tlo_mode_none Kernel loopback disabled\n" +"\t\tlo_mode_fifo Enable kernel loopback with fifo\n" +"\t\tlo_mode_fifo_skb Enable kernel loopback with fifo and skb buffer\n" +"\t\t" +); + +module_param(kthread_mode, charp, 0644); +MODULE_PARM_DESC(kthread_mode, +"Kernel thread mode (default=single):\n" +"\t\tsingle Single kernel thread mode enabled.\n" +"\t\tmultiple Multiple kernel thread mode enabled.\n" +"\t\t" +); + +module_param(carrier, charp, 0644); +MODULE_PARM_DESC(carrier, +"Default carrier state for KNI interface (default=off):\n" +"\t\toff Interfaces will be created with carrier state set to off.\n" +"\t\ton Interfaces will be created with carrier state set to on.\n" +"\t\t" +); diff --git a/src/spdk/dpdk/kernel/linux/kni/kni_net.c b/src/spdk/dpdk/kernel/linux/kni/kni_net.c new file mode 100644 index 000000000..c82c881a2 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/kni_net.c @@ -0,0 +1,844 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(c) 2010-2014 Intel Corporation. + */ + +/* + * This code is inspired from the book "Linux Device Drivers" by + * Alessandro Rubini and Jonathan Corbet, published by O'Reilly & Associates + */ + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/version.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> /* eth_type_trans */ +#include <linux/ethtool.h> +#include <linux/skbuff.h> +#include <linux/kthread.h> +#include <linux/delay.h> + +#include <rte_kni_common.h> +#include <kni_fifo.h> + +#include "compat.h" +#include "kni_dev.h" + +#define WD_TIMEOUT 5 /*jiffies */ + +#define KNI_WAIT_RESPONSE_TIMEOUT 300 /* 3 seconds */ + +/* typedef for rx function */ +typedef void (*kni_net_rx_t)(struct kni_dev *kni); + +static void kni_net_rx_normal(struct kni_dev *kni); + +/* kni rx function pointer, with default to normal rx */ +static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal; + +#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT +/* iova to kernel virtual address */ +static inline void * +iova2kva(struct kni_dev *kni, void *iova) +{ + return phys_to_virt(iova_to_phys(kni->usr_tsk, (unsigned long)iova)); +} + +static inline void * +iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m) +{ + return phys_to_virt(iova_to_phys(kni->usr_tsk, m->buf_physaddr) + + m->data_off); +} +#endif + +/* physical address to kernel virtual address */ +static void * +pa2kva(void *pa) +{ + return phys_to_virt((unsigned long)pa); +} + +/* physical address to virtual address */ +static void * +pa2va(void *pa, struct rte_kni_mbuf *m) +{ + void *va; + + va = (void *)((unsigned long)pa + + (unsigned long)m->buf_addr - + (unsigned long)m->buf_physaddr); + return va; +} + +/* mbuf data kernel virtual address from mbuf kernel virtual address */ +static void * +kva2data_kva(struct rte_kni_mbuf *m) +{ + return phys_to_virt(m->buf_physaddr + m->data_off); +} + +static inline void * +get_kva(struct kni_dev *kni, void *pa) +{ +#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT + if (kni->iova_mode == 1) + return iova2kva(kni, pa); +#endif + return pa2kva(pa); +} + +static inline void * +get_data_kva(struct kni_dev *kni, void *pkt_kva) +{ +#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT + if (kni->iova_mode == 1) + return iova2data_kva(kni, pkt_kva); +#endif + return kva2data_kva(pkt_kva); +} + +/* + * It can be called to process the request. + */ +static int +kni_net_process_request(struct kni_dev *kni, struct rte_kni_request *req) +{ + int ret = -1; + void *resp_va; + uint32_t num; + int ret_val; + + if (!kni || !req) { + pr_err("No kni instance or request\n"); + return -EINVAL; + } + + mutex_lock(&kni->sync_lock); + + /* Construct data */ + memcpy(kni->sync_kva, req, sizeof(struct rte_kni_request)); + num = kni_fifo_put(kni->req_q, &kni->sync_va, 1); + if (num < 1) { + pr_err("Cannot send to req_q\n"); + ret = -EBUSY; + goto fail; + } + + ret_val = wait_event_interruptible_timeout(kni->wq, + kni_fifo_count(kni->resp_q), 3 * HZ); + if (signal_pending(current) || ret_val <= 0) { + ret = -ETIME; + goto fail; + } + num = kni_fifo_get(kni->resp_q, (void **)&resp_va, 1); + if (num != 1 || resp_va != kni->sync_va) { + /* This should never happen */ + pr_err("No data in resp_q\n"); + ret = -ENODATA; + goto fail; + } + + memcpy(req, kni->sync_kva, sizeof(struct rte_kni_request)); + ret = 0; + +fail: + mutex_unlock(&kni->sync_lock); + return ret; +} + +/* + * Open and close + */ +static int +kni_net_open(struct net_device *dev) +{ + int ret; + struct rte_kni_request req; + struct kni_dev *kni = netdev_priv(dev); + + netif_start_queue(dev); + if (kni_dflt_carrier == 1) + netif_carrier_on(dev); + else + netif_carrier_off(dev); + + memset(&req, 0, sizeof(req)); + req.req_id = RTE_KNI_REQ_CFG_NETWORK_IF; + + /* Setting if_up to non-zero means up */ + req.if_up = 1; + ret = kni_net_process_request(kni, &req); + + return (ret == 0) ? req.result : ret; +} + +static int +kni_net_release(struct net_device *dev) +{ + int ret; + struct rte_kni_request req; + struct kni_dev *kni = netdev_priv(dev); + + netif_stop_queue(dev); /* can't transmit any more */ + netif_carrier_off(dev); + + memset(&req, 0, sizeof(req)); + req.req_id = RTE_KNI_REQ_CFG_NETWORK_IF; + + /* Setting if_up to 0 means down */ + req.if_up = 0; + ret = kni_net_process_request(kni, &req); + + return (ret == 0) ? req.result : ret; +} + +static void +kni_fifo_trans_pa2va(struct kni_dev *kni, + struct rte_kni_fifo *src_pa, struct rte_kni_fifo *dst_va) +{ + uint32_t ret, i, num_dst, num_rx; + struct rte_kni_mbuf *kva, *prev_kva; + int nb_segs; + int kva_nb_segs; + + do { + num_dst = kni_fifo_free_count(dst_va); + if (num_dst == 0) + return; + + num_rx = min_t(uint32_t, num_dst, MBUF_BURST_SZ); + + num_rx = kni_fifo_get(src_pa, kni->pa, num_rx); + if (num_rx == 0) + return; + + for (i = 0; i < num_rx; i++) { + kva = get_kva(kni, kni->pa[i]); + kni->va[i] = pa2va(kni->pa[i], kva); + + kva_nb_segs = kva->nb_segs; + for (nb_segs = 0; nb_segs < kva_nb_segs; nb_segs++) { + if (!kva->next) + break; + + prev_kva = kva; + kva = pa2kva(kva->next); + /* Convert physical address to virtual address */ + prev_kva->next = pa2va(prev_kva->next, kva); + } + } + + ret = kni_fifo_put(dst_va, kni->va, num_rx); + if (ret != num_rx) { + /* Failing should not happen */ + pr_err("Fail to enqueue entries into dst_va\n"); + return; + } + } while (1); +} + +/* Try to release mbufs when kni release */ +void kni_net_release_fifo_phy(struct kni_dev *kni) +{ + /* release rx_q first, because it can't release in userspace */ + kni_fifo_trans_pa2va(kni, kni->rx_q, kni->free_q); + /* release alloc_q for speeding up kni release in userspace */ + kni_fifo_trans_pa2va(kni, kni->alloc_q, kni->free_q); +} + +/* + * Configuration changes (passed on by ifconfig) + */ +static int +kni_net_config(struct net_device *dev, struct ifmap *map) +{ + if (dev->flags & IFF_UP) /* can't act on a running interface */ + return -EBUSY; + + /* ignore other fields */ + return 0; +} + +/* + * Transmit a packet (called by the kernel) + */ +static int +kni_net_tx(struct sk_buff *skb, struct net_device *dev) +{ + int len = 0; + uint32_t ret; + struct kni_dev *kni = netdev_priv(dev); + struct rte_kni_mbuf *pkt_kva = NULL; + void *pkt_pa = NULL; + void *pkt_va = NULL; + + /* save the timestamp */ +#ifdef HAVE_TRANS_START_HELPER + netif_trans_update(dev); +#else + dev->trans_start = jiffies; +#endif + + /* Check if the length of skb is less than mbuf size */ + if (skb->len > kni->mbuf_size) + goto drop; + + /** + * Check if it has at least one free entry in tx_q and + * one entry in alloc_q. + */ + if (kni_fifo_free_count(kni->tx_q) == 0 || + kni_fifo_count(kni->alloc_q) == 0) { + /** + * If no free entry in tx_q or no entry in alloc_q, + * drops skb and goes out. + */ + goto drop; + } + + /* dequeue a mbuf from alloc_q */ + ret = kni_fifo_get(kni->alloc_q, &pkt_pa, 1); + if (likely(ret == 1)) { + void *data_kva; + + pkt_kva = get_kva(kni, pkt_pa); + data_kva = get_data_kva(kni, pkt_kva); + pkt_va = pa2va(pkt_pa, pkt_kva); + + len = skb->len; + memcpy(data_kva, skb->data, len); + if (unlikely(len < ETH_ZLEN)) { + memset(data_kva + len, 0, ETH_ZLEN - len); + len = ETH_ZLEN; + } + pkt_kva->pkt_len = len; + pkt_kva->data_len = len; + + /* enqueue mbuf into tx_q */ + ret = kni_fifo_put(kni->tx_q, &pkt_va, 1); + if (unlikely(ret != 1)) { + /* Failing should not happen */ + pr_err("Fail to enqueue mbuf into tx_q\n"); + goto drop; + } + } else { + /* Failing should not happen */ + pr_err("Fail to dequeue mbuf from alloc_q\n"); + goto drop; + } + + /* Free skb and update statistics */ + dev_kfree_skb(skb); + dev->stats.tx_bytes += len; + dev->stats.tx_packets++; + + return NETDEV_TX_OK; + +drop: + /* Free skb and update statistics */ + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + + return NETDEV_TX_OK; +} + +/* + * RX: normal working mode + */ +static void +kni_net_rx_normal(struct kni_dev *kni) +{ + uint32_t ret; + uint32_t len; + uint32_t i, num_rx, num_fq; + struct rte_kni_mbuf *kva, *prev_kva; + void *data_kva; + struct sk_buff *skb; + struct net_device *dev = kni->net_dev; + + /* Get the number of free entries in free_q */ + num_fq = kni_fifo_free_count(kni->free_q); + if (num_fq == 0) { + /* No room on the free_q, bail out */ + return; + } + + /* Calculate the number of entries to dequeue from rx_q */ + num_rx = min_t(uint32_t, num_fq, MBUF_BURST_SZ); + + /* Burst dequeue from rx_q */ + num_rx = kni_fifo_get(kni->rx_q, kni->pa, num_rx); + if (num_rx == 0) + return; + + /* Transfer received packets to netif */ + for (i = 0; i < num_rx; i++) { + kva = get_kva(kni, kni->pa[i]); + len = kva->pkt_len; + data_kva = get_data_kva(kni, kva); + kni->va[i] = pa2va(kni->pa[i], kva); + + skb = netdev_alloc_skb(dev, len); + if (!skb) { + /* Update statistics */ + dev->stats.rx_dropped++; + continue; + } + + if (kva->nb_segs == 1) { + memcpy(skb_put(skb, len), data_kva, len); + } else { + int nb_segs; + int kva_nb_segs = kva->nb_segs; + + for (nb_segs = 0; nb_segs < kva_nb_segs; nb_segs++) { + memcpy(skb_put(skb, kva->data_len), + data_kva, kva->data_len); + + if (!kva->next) + break; + + prev_kva = kva; + kva = pa2kva(kva->next); + data_kva = kva2data_kva(kva); + /* Convert physical address to virtual address */ + prev_kva->next = pa2va(prev_kva->next, kva); + } + } + + skb->protocol = eth_type_trans(skb, dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* Call netif interface */ + netif_rx_ni(skb); + + /* Update statistics */ + dev->stats.rx_bytes += len; + dev->stats.rx_packets++; + } + + /* Burst enqueue mbufs into free_q */ + ret = kni_fifo_put(kni->free_q, kni->va, num_rx); + if (ret != num_rx) + /* Failing should not happen */ + pr_err("Fail to enqueue entries into free_q\n"); +} + +/* + * RX: loopback with enqueue/dequeue fifos. + */ +static void +kni_net_rx_lo_fifo(struct kni_dev *kni) +{ + uint32_t ret; + uint32_t len; + uint32_t i, num, num_rq, num_tq, num_aq, num_fq; + struct rte_kni_mbuf *kva, *next_kva; + void *data_kva; + struct rte_kni_mbuf *alloc_kva; + void *alloc_data_kva; + struct net_device *dev = kni->net_dev; + + /* Get the number of entries in rx_q */ + num_rq = kni_fifo_count(kni->rx_q); + + /* Get the number of free entries in tx_q */ + num_tq = kni_fifo_free_count(kni->tx_q); + + /* Get the number of entries in alloc_q */ + num_aq = kni_fifo_count(kni->alloc_q); + + /* Get the number of free entries in free_q */ + num_fq = kni_fifo_free_count(kni->free_q); + + /* Calculate the number of entries to be dequeued from rx_q */ + num = min(num_rq, num_tq); + num = min(num, num_aq); + num = min(num, num_fq); + num = min_t(uint32_t, num, MBUF_BURST_SZ); + + /* Return if no entry to dequeue from rx_q */ + if (num == 0) + return; + + /* Burst dequeue from rx_q */ + ret = kni_fifo_get(kni->rx_q, kni->pa, num); + if (ret == 0) + return; /* Failing should not happen */ + + /* Dequeue entries from alloc_q */ + ret = kni_fifo_get(kni->alloc_q, kni->alloc_pa, num); + if (ret) { + num = ret; + /* Copy mbufs */ + for (i = 0; i < num; i++) { + kva = get_kva(kni, kni->pa[i]); + len = kva->data_len; + data_kva = get_data_kva(kni, kva); + kni->va[i] = pa2va(kni->pa[i], kva); + + while (kva->next) { + next_kva = pa2kva(kva->next); + /* Convert physical address to virtual address */ + kva->next = pa2va(kva->next, next_kva); + kva = next_kva; + } + + alloc_kva = get_kva(kni, kni->alloc_pa[i]); + alloc_data_kva = get_data_kva(kni, alloc_kva); + kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva); + + memcpy(alloc_data_kva, data_kva, len); + alloc_kva->pkt_len = len; + alloc_kva->data_len = len; + + dev->stats.tx_bytes += len; + dev->stats.rx_bytes += len; + } + + /* Burst enqueue mbufs into tx_q */ + ret = kni_fifo_put(kni->tx_q, kni->alloc_va, num); + if (ret != num) + /* Failing should not happen */ + pr_err("Fail to enqueue mbufs into tx_q\n"); + } + + /* Burst enqueue mbufs into free_q */ + ret = kni_fifo_put(kni->free_q, kni->va, num); + if (ret != num) + /* Failing should not happen */ + pr_err("Fail to enqueue mbufs into free_q\n"); + + /** + * Update statistic, and enqueue/dequeue failure is impossible, + * as all queues are checked at first. + */ + dev->stats.tx_packets += num; + dev->stats.rx_packets += num; +} + +/* + * RX: loopback with enqueue/dequeue fifos and sk buffer copies. + */ +static void +kni_net_rx_lo_fifo_skb(struct kni_dev *kni) +{ + uint32_t ret; + uint32_t len; + uint32_t i, num_rq, num_fq, num; + struct rte_kni_mbuf *kva, *prev_kva; + void *data_kva; + struct sk_buff *skb; + struct net_device *dev = kni->net_dev; + + /* Get the number of entries in rx_q */ + num_rq = kni_fifo_count(kni->rx_q); + + /* Get the number of free entries in free_q */ + num_fq = kni_fifo_free_count(kni->free_q); + + /* Calculate the number of entries to dequeue from rx_q */ + num = min(num_rq, num_fq); + num = min_t(uint32_t, num, MBUF_BURST_SZ); + + /* Return if no entry to dequeue from rx_q */ + if (num == 0) + return; + + /* Burst dequeue mbufs from rx_q */ + ret = kni_fifo_get(kni->rx_q, kni->pa, num); + if (ret == 0) + return; + + /* Copy mbufs to sk buffer and then call tx interface */ + for (i = 0; i < num; i++) { + kva = get_kva(kni, kni->pa[i]); + len = kva->pkt_len; + data_kva = get_data_kva(kni, kva); + kni->va[i] = pa2va(kni->pa[i], kva); + + skb = netdev_alloc_skb(dev, len); + if (skb) { + memcpy(skb_put(skb, len), data_kva, len); + skb->ip_summed = CHECKSUM_UNNECESSARY; + dev_kfree_skb(skb); + } + + /* Simulate real usage, allocate/copy skb twice */ + skb = netdev_alloc_skb(dev, len); + if (skb == NULL) { + dev->stats.rx_dropped++; + continue; + } + + if (kva->nb_segs == 1) { + memcpy(skb_put(skb, len), data_kva, len); + } else { + int nb_segs; + int kva_nb_segs = kva->nb_segs; + + for (nb_segs = 0; nb_segs < kva_nb_segs; nb_segs++) { + memcpy(skb_put(skb, kva->data_len), + data_kva, kva->data_len); + + if (!kva->next) + break; + + prev_kva = kva; + kva = get_kva(kni, kva->next); + data_kva = get_data_kva(kni, kva); + /* Convert physical address to virtual address */ + prev_kva->next = pa2va(prev_kva->next, kva); + } + } + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + dev->stats.rx_bytes += len; + dev->stats.rx_packets++; + + /* call tx interface */ + kni_net_tx(skb, dev); + } + + /* enqueue all the mbufs from rx_q into free_q */ + ret = kni_fifo_put(kni->free_q, kni->va, num); + if (ret != num) + /* Failing should not happen */ + pr_err("Fail to enqueue mbufs into free_q\n"); +} + +/* rx interface */ +void +kni_net_rx(struct kni_dev *kni) +{ + /** + * It doesn't need to check if it is NULL pointer, + * as it has a default value + */ + (*kni_net_rx_func)(kni); +} + +/* + * Deal with a transmit timeout. + */ +#ifdef HAVE_TX_TIMEOUT_TXQUEUE +static void +kni_net_tx_timeout(struct net_device *dev, unsigned int txqueue) +#else +static void +kni_net_tx_timeout(struct net_device *dev) +#endif +{ + pr_debug("Transmit timeout at %ld, latency %ld\n", jiffies, + jiffies - dev_trans_start(dev)); + + dev->stats.tx_errors++; + netif_wake_queue(dev); +} + +static int +kni_net_change_mtu(struct net_device *dev, int new_mtu) +{ + int ret; + struct rte_kni_request req; + struct kni_dev *kni = netdev_priv(dev); + + pr_debug("kni_net_change_mtu new mtu %d to be set\n", new_mtu); + + memset(&req, 0, sizeof(req)); + req.req_id = RTE_KNI_REQ_CHANGE_MTU; + req.new_mtu = new_mtu; + ret = kni_net_process_request(kni, &req); + if (ret == 0 && req.result == 0) + dev->mtu = new_mtu; + + return (ret == 0) ? req.result : ret; +} + +static void +kni_net_change_rx_flags(struct net_device *netdev, int flags) +{ + struct rte_kni_request req; + struct kni_dev *kni = netdev_priv(netdev); + + memset(&req, 0, sizeof(req)); + + if (flags & IFF_ALLMULTI) { + req.req_id = RTE_KNI_REQ_CHANGE_ALLMULTI; + + if (netdev->flags & IFF_ALLMULTI) + req.allmulti = 1; + else + req.allmulti = 0; + } + + if (flags & IFF_PROMISC) { + req.req_id = RTE_KNI_REQ_CHANGE_PROMISC; + + if (netdev->flags & IFF_PROMISC) + req.promiscusity = 1; + else + req.promiscusity = 0; + } + + kni_net_process_request(kni, &req); +} + +/* + * Checks if the user space application provided the resp message + */ +void +kni_net_poll_resp(struct kni_dev *kni) +{ + if (kni_fifo_count(kni->resp_q)) + wake_up_interruptible(&kni->wq); +} + +/* + * Fill the eth header + */ +static int +kni_net_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, uint32_t len) +{ + struct ethhdr *eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + + memcpy(eth->h_source, saddr ? saddr : dev->dev_addr, dev->addr_len); + memcpy(eth->h_dest, daddr ? daddr : dev->dev_addr, dev->addr_len); + eth->h_proto = htons(type); + + return dev->hard_header_len; +} + +/* + * Re-fill the eth header + */ +#ifdef HAVE_REBUILD_HEADER +static int +kni_net_rebuild_header(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct ethhdr *eth = (struct ethhdr *) skb->data; + + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); + memcpy(eth->h_dest, dev->dev_addr, dev->addr_len); + + return 0; +} +#endif /* < 4.1.0 */ + +/** + * kni_net_set_mac - Change the Ethernet Address of the KNI NIC + * @netdev: network interface device structure + * @p: pointer to an address structure + * + * Returns 0 on success, negative on failure + **/ +static int +kni_net_set_mac(struct net_device *netdev, void *p) +{ + int ret; + struct rte_kni_request req; + struct kni_dev *kni; + struct sockaddr *addr = p; + + memset(&req, 0, sizeof(req)); + req.req_id = RTE_KNI_REQ_CHANGE_MAC_ADDR; + + if (!is_valid_ether_addr((unsigned char *)(addr->sa_data))) + return -EADDRNOTAVAIL; + + memcpy(req.mac_addr, addr->sa_data, netdev->addr_len); + memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len); + + kni = netdev_priv(netdev); + ret = kni_net_process_request(kni, &req); + + return (ret == 0 ? req.result : ret); +} + +#ifdef HAVE_CHANGE_CARRIER_CB +static int +kni_net_change_carrier(struct net_device *dev, bool new_carrier) +{ + if (new_carrier) + netif_carrier_on(dev); + else + netif_carrier_off(dev); + return 0; +} +#endif + +static const struct header_ops kni_net_header_ops = { + .create = kni_net_header, + .parse = eth_header_parse, +#ifdef HAVE_REBUILD_HEADER + .rebuild = kni_net_rebuild_header, +#endif /* < 4.1.0 */ + .cache = NULL, /* disable caching */ +}; + +static const struct net_device_ops kni_net_netdev_ops = { + .ndo_open = kni_net_open, + .ndo_stop = kni_net_release, + .ndo_set_config = kni_net_config, + .ndo_change_rx_flags = kni_net_change_rx_flags, + .ndo_start_xmit = kni_net_tx, + .ndo_change_mtu = kni_net_change_mtu, + .ndo_tx_timeout = kni_net_tx_timeout, + .ndo_set_mac_address = kni_net_set_mac, +#ifdef HAVE_CHANGE_CARRIER_CB + .ndo_change_carrier = kni_net_change_carrier, +#endif +}; + +static void kni_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->version, KNI_VERSION, sizeof(info->version)); + strlcpy(info->driver, "kni", sizeof(info->driver)); +} + +static const struct ethtool_ops kni_net_ethtool_ops = { + .get_drvinfo = kni_get_drvinfo, + .get_link = ethtool_op_get_link, +}; + +void +kni_net_init(struct net_device *dev) +{ + struct kni_dev *kni = netdev_priv(dev); + + init_waitqueue_head(&kni->wq); + mutex_init(&kni->sync_lock); + + ether_setup(dev); /* assign some of the fields */ + dev->netdev_ops = &kni_net_netdev_ops; + dev->header_ops = &kni_net_header_ops; + dev->ethtool_ops = &kni_net_ethtool_ops; + dev->watchdog_timeo = WD_TIMEOUT; +} + +void +kni_net_config_lo_mode(char *lo_str) +{ + if (!lo_str) { + pr_debug("loopback disabled"); + return; + } + + if (!strcmp(lo_str, "lo_mode_none")) + pr_debug("loopback disabled"); + else if (!strcmp(lo_str, "lo_mode_fifo")) { + pr_debug("loopback mode=lo_mode_fifo enabled"); + kni_net_rx_func = kni_net_rx_lo_fifo; + } else if (!strcmp(lo_str, "lo_mode_fifo_skb")) { + pr_debug("loopback mode=lo_mode_fifo_skb enabled"); + kni_net_rx_func = kni_net_rx_lo_fifo_skb; + } else { + pr_debug("Unknown loopback parameter, disabled"); + } +} diff --git a/src/spdk/dpdk/kernel/linux/kni/meson.build b/src/spdk/dpdk/kernel/linux/kni/meson.build new file mode 100644 index 000000000..d696347f2 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/kni/meson.build @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +kni_mkfile = custom_target('rte_kni_makefile', + output: 'Makefile', + command: ['touch', '@OUTPUT@']) + +kni_sources = files( + 'kni_misc.c', + 'kni_net.c', + 'Kbuild') + +custom_target('rte_kni', + input: kni_sources, + output: 'rte_kni.ko', + command: ['make', '-j4', '-C', kernel_dir + '/build', + 'M=' + meson.current_build_dir(), + 'src=' + meson.current_source_dir(), + 'MODULE_CFLAGS=-include ' + meson.source_root() + '/config/rte_config.h' + + ' -I' + meson.source_root() + '/lib/librte_eal/include' + + ' -I' + meson.source_root() + '/lib/librte_eal/linux/include' + + ' -I' + meson.build_root() + + ' -I' + meson.current_source_dir(), + 'modules'], + depends: kni_mkfile, + install: true, + install_dir: kernel_dir + '/extra/dpdk', + build_by_default: get_option('enable_kmods')) diff --git a/src/spdk/dpdk/kernel/linux/meson.build b/src/spdk/dpdk/kernel/linux/meson.build new file mode 100644 index 000000000..da79df168 --- /dev/null +++ b/src/spdk/dpdk/kernel/linux/meson.build @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +subdirs = ['igb_uio', 'kni'] + +# if we are cross-compiling we need kernel_dir specified +if get_option('kernel_dir') == '' and meson.is_cross_build() + error('Need "kernel_dir" option for kmod compilation when cross-compiling') +endif + +kernel_dir = get_option('kernel_dir') +if kernel_dir == '' + # use default path for native builds + kernel_version = run_command('uname', '-r').stdout().strip() + kernel_dir = '/lib/modules/' + kernel_version +endif + +# test running make in kernel directory, using "make kernelversion" +make_returncode = run_command('make', '-sC', kernel_dir + '/build', + 'kernelversion').returncode() +if make_returncode != 0 + error('Cannot compile kernel modules as requested - are kernel headers installed?') +endif + +# DO ACTUAL MODULE BUILDING +foreach d:subdirs + subdir(d) +endforeach |