author     Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/dpdk/drivers/vdpa
parent     Initial commit. (diff)
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/dpdk/drivers/vdpa')
20 files changed, 4599 insertions, 0 deletions
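
Usage context: this commit vendors two DPDK vDPA PMDs (ifc for Intel IFCVF virtio-net VFs, mlx5 for Mellanox ConnectX devices) into the SPDK tree shipped with Ceph. Both drivers register themselves with the rte_vdpa framework at PCI probe time (the ifc PMD only when the "vdpa=1" devarg is given, see ifcvf_pci_probe below), and a vhost-user application then attaches the registered device to a socket. The sketch below shows that application-side flow against the did-based vDPA API of this DPDK generation; it is illustrative only, and the PCI address and socket path are assumptions, not values taken from this commit.

/*
 * Minimal, hedged sketch of attaching a vDPA device (as registered by
 * the drivers added in this commit) to a vhost-user socket.
 * Assumptions: DPDK 20.05-era did-based vDPA API; the PCI address
 * 0000:06:00.1 and socket path /tmp/vdpa.sock are illustrative only.
 */
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_vdpa.h>
#include <rte_vhost.h>

int main(int argc, char **argv)
{
	struct rte_vdpa_dev_addr addr = { .type = VDPA_ADDR_PCI };
	const char *path = "/tmp/vdpa.sock";	/* hypothetical vhost-user socket */
	int did;

	if (rte_eal_init(argc, argv) < 0)
		return -1;

	/* Address of the VF probed by the PMD (for ifc, with devarg "vdpa=1"). */
	if (rte_pci_addr_parse("0000:06:00.1", &addr.pci_addr) != 0)
		return -1;

	/* Look up the device id the PMD obtained from rte_vdpa_register_device(). */
	did = rte_vdpa_find_device_id(&addr);
	if (did < 0)
		return -1;

	/* Create a vhost-user socket and bind it to the vDPA device. */
	if (rte_vhost_driver_register(path, 0) != 0 ||
	    rte_vhost_driver_attach_vdpa_device(path, did) != 0)
		return -1;

	return rte_vhost_driver_start(path);
}

A real application would additionally negotiate features (e.g. via rte_vhost_driver_set_features) and install vhost callbacks before starting the driver; those steps are omitted here for brevity.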
diff --git a/src/spdk/dpdk/drivers/vdpa/Makefile b/src/spdk/dpdk/drivers/vdpa/Makefile new file mode 100644 index 000000000..6e8835948 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2019 Mellanox Technologies, Ltd + +include $(RTE_SDK)/mk/rte.vars.mk + +ifeq ($(CONFIG_RTE_EAL_VFIO),y) +DIRS-$(CONFIG_RTE_LIBRTE_IFC_PMD) += ifc +endif + +DIRS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5 + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/Makefile b/src/spdk/dpdk/drivers/vdpa/ifc/Makefile new file mode 100644 index 000000000..b468bfdbd --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/ifc/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_pmd_ifc.a + +LDLIBS += -lpthread +LDLIBS += -lrte_eal -lrte_pci -lrte_vhost -lrte_bus_pci +LDLIBS += -lrte_kvargs + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +# +# Add extra flags for base driver source files to disable warnings in them +# +BASE_DRIVER_OBJS=$(sort $(patsubst %.c,%.o,$(notdir $(wildcard $(SRCDIR)/base/*.c)))) + +VPATH += $(SRCDIR)/base + +EXPORT_MAP := rte_pmd_ifc_version.map + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_IFC_PMD) += ifcvf_vdpa.c +SRCS-$(CONFIG_RTE_LIBRTE_IFC_PMD) += ifcvf.c + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.c b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.c new file mode 100644 index 000000000..3c0b2dff6 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.c @@ -0,0 +1,329 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include "ifcvf.h" +#include "ifcvf_osdep.h" + +STATIC void * +get_cap_addr(struct ifcvf_hw *hw, struct ifcvf_pci_cap *cap) +{ + u8 bar = cap->bar; + u32 length = cap->length; + u32 offset = cap->offset; + + if (bar > IFCVF_PCI_MAX_RESOURCE - 1) { + DEBUGOUT("invalid bar: %u\n", bar); + return NULL; + } + + if (offset + length < offset) { + DEBUGOUT("offset(%u) + length(%u) overflows\n", + offset, length); + return NULL; + } + + if (offset + length > hw->mem_resource[cap->bar].len) { + DEBUGOUT("offset(%u) + length(%u) overflows bar length(%u)", + offset, length, (u32)hw->mem_resource[cap->bar].len); + return NULL; + } + + return hw->mem_resource[bar].addr + offset; +} + +int +ifcvf_init_hw(struct ifcvf_hw *hw, PCI_DEV *dev) +{ + int ret; + u8 pos; + struct ifcvf_pci_cap cap; + + ret = PCI_READ_CONFIG_BYTE(dev, &pos, PCI_CAPABILITY_LIST); + if (ret < 0) { + DEBUGOUT("failed to read pci capability list\n"); + return -1; + } + + while (pos) { + ret = PCI_READ_CONFIG_RANGE(dev, (u32 *)&cap, + sizeof(cap), pos); + if (ret < 0) { + DEBUGOUT("failed to read cap at pos: %x", pos); + break; + } + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) + goto next; + + DEBUGOUT("cfg type: %u, bar: %u, offset: %u, " + "len: %u\n", cap.cfg_type, cap.bar, + cap.offset, cap.length); + + switch (cap.cfg_type) { + case IFCVF_PCI_CAP_COMMON_CFG: + hw->common_cfg = get_cap_addr(hw, &cap); + break; + case IFCVF_PCI_CAP_NOTIFY_CFG: + PCI_READ_CONFIG_DWORD(dev, &hw->notify_off_multiplier, + pos + sizeof(cap)); + hw->notify_base = get_cap_addr(hw, &cap); + hw->notify_region = cap.bar; + break; + case IFCVF_PCI_CAP_ISR_CFG: + hw->isr = get_cap_addr(hw, &cap); + break; + case IFCVF_PCI_CAP_DEVICE_CFG: + hw->dev_cfg = get_cap_addr(hw, &cap); + break; + } +next: + pos = cap.cap_next; + } + 
+ hw->lm_cfg = hw->mem_resource[4].addr; + + if (hw->common_cfg == NULL || hw->notify_base == NULL || + hw->isr == NULL || hw->dev_cfg == NULL) { + DEBUGOUT("capability incomplete\n"); + return -1; + } + + DEBUGOUT("capability mapping:\ncommon cfg: %p\n" + "notify base: %p\nisr cfg: %p\ndevice cfg: %p\n" + "multiplier: %u\n", + hw->common_cfg, hw->dev_cfg, + hw->isr, hw->notify_base, + hw->notify_off_multiplier); + + return 0; +} + +STATIC u8 +ifcvf_get_status(struct ifcvf_hw *hw) +{ + return IFCVF_READ_REG8(&hw->common_cfg->device_status); +} + +STATIC void +ifcvf_set_status(struct ifcvf_hw *hw, u8 status) +{ + IFCVF_WRITE_REG8(status, &hw->common_cfg->device_status); +} + +STATIC void +ifcvf_reset(struct ifcvf_hw *hw) +{ + ifcvf_set_status(hw, 0); + + /* flush status write */ + while (ifcvf_get_status(hw)) + msec_delay(1); +} + +STATIC void +ifcvf_add_status(struct ifcvf_hw *hw, u8 status) +{ + if (status != 0) + status |= ifcvf_get_status(hw); + + ifcvf_set_status(hw, status); + ifcvf_get_status(hw); +} + +u64 +ifcvf_get_features(struct ifcvf_hw *hw) +{ + u32 features_lo, features_hi; + struct ifcvf_pci_common_cfg *cfg = hw->common_cfg; + + IFCVF_WRITE_REG32(0, &cfg->device_feature_select); + features_lo = IFCVF_READ_REG32(&cfg->device_feature); + + IFCVF_WRITE_REG32(1, &cfg->device_feature_select); + features_hi = IFCVF_READ_REG32(&cfg->device_feature); + + return ((u64)features_hi << 32) | features_lo; +} + +STATIC void +ifcvf_set_features(struct ifcvf_hw *hw, u64 features) +{ + struct ifcvf_pci_common_cfg *cfg = hw->common_cfg; + + IFCVF_WRITE_REG32(0, &cfg->guest_feature_select); + IFCVF_WRITE_REG32(features & ((1ULL << 32) - 1), &cfg->guest_feature); + + IFCVF_WRITE_REG32(1, &cfg->guest_feature_select); + IFCVF_WRITE_REG32(features >> 32, &cfg->guest_feature); +} + +STATIC int +ifcvf_config_features(struct ifcvf_hw *hw) +{ + u64 host_features; + + host_features = ifcvf_get_features(hw); + hw->req_features &= host_features; + + ifcvf_set_features(hw, hw->req_features); + ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_FEATURES_OK); + + if (!(ifcvf_get_status(hw) & IFCVF_CONFIG_STATUS_FEATURES_OK)) { + DEBUGOUT("failed to set FEATURES_OK status\n"); + return -1; + } + + return 0; +} + +STATIC void +io_write64_twopart(u64 val, u32 *lo, u32 *hi) +{ + IFCVF_WRITE_REG32(val & ((1ULL << 32) - 1), lo); + IFCVF_WRITE_REG32(val >> 32, hi); +} + +STATIC int +ifcvf_hw_enable(struct ifcvf_hw *hw) +{ + struct ifcvf_pci_common_cfg *cfg; + u8 *lm_cfg; + u32 i; + u16 notify_off; + + cfg = hw->common_cfg; + lm_cfg = hw->lm_cfg; + + IFCVF_WRITE_REG16(0, &cfg->msix_config); + if (IFCVF_READ_REG16(&cfg->msix_config) == IFCVF_MSI_NO_VECTOR) { + DEBUGOUT("msix vec alloc failed for device config\n"); + return -1; + } + + for (i = 0; i < hw->nr_vring; i++) { + IFCVF_WRITE_REG16(i, &cfg->queue_select); + io_write64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo, + &cfg->queue_desc_hi); + io_write64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo, + &cfg->queue_avail_hi); + io_write64_twopart(hw->vring[i].used, &cfg->queue_used_lo, + &cfg->queue_used_hi); + IFCVF_WRITE_REG16(hw->vring[i].size, &cfg->queue_size); + + *(u32 *)(lm_cfg + IFCVF_LM_RING_STATE_OFFSET + + (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4) = + (u32)hw->vring[i].last_avail_idx | + ((u32)hw->vring[i].last_used_idx << 16); + + IFCVF_WRITE_REG16(i + 1, &cfg->queue_msix_vector); + if (IFCVF_READ_REG16(&cfg->queue_msix_vector) == + IFCVF_MSI_NO_VECTOR) { + DEBUGOUT("queue %u, msix vec alloc failed\n", + i); + return -1; + } + + notify_off = 
IFCVF_READ_REG16(&cfg->queue_notify_off); + hw->notify_addr[i] = (void *)((u8 *)hw->notify_base + + notify_off * hw->notify_off_multiplier); + IFCVF_WRITE_REG16(1, &cfg->queue_enable); + } + + return 0; +} + +STATIC void +ifcvf_hw_disable(struct ifcvf_hw *hw) +{ + u32 i; + struct ifcvf_pci_common_cfg *cfg; + u32 ring_state; + + cfg = hw->common_cfg; + + IFCVF_WRITE_REG16(IFCVF_MSI_NO_VECTOR, &cfg->msix_config); + for (i = 0; i < hw->nr_vring; i++) { + IFCVF_WRITE_REG16(i, &cfg->queue_select); + IFCVF_WRITE_REG16(0, &cfg->queue_enable); + IFCVF_WRITE_REG16(IFCVF_MSI_NO_VECTOR, &cfg->queue_msix_vector); + ring_state = *(u32 *)(hw->lm_cfg + IFCVF_LM_RING_STATE_OFFSET + + (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4); + hw->vring[i].last_avail_idx = (u16)(ring_state >> 16); + hw->vring[i].last_used_idx = (u16)(ring_state >> 16); + } +} + +int +ifcvf_start_hw(struct ifcvf_hw *hw) +{ + ifcvf_reset(hw); + ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_ACK); + ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_DRIVER); + + if (ifcvf_config_features(hw) < 0) + return -1; + + if (ifcvf_hw_enable(hw) < 0) + return -1; + + ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_DRIVER_OK); + return 0; +} + +void +ifcvf_stop_hw(struct ifcvf_hw *hw) +{ + ifcvf_hw_disable(hw); + ifcvf_reset(hw); +} + +void +ifcvf_enable_logging(struct ifcvf_hw *hw, u64 log_base, u64 log_size) +{ + u8 *lm_cfg; + + lm_cfg = hw->lm_cfg; + + *(u32 *)(lm_cfg + IFCVF_LM_BASE_ADDR_LOW) = + log_base & IFCVF_32_BIT_MASK; + + *(u32 *)(lm_cfg + IFCVF_LM_BASE_ADDR_HIGH) = + (log_base >> 32) & IFCVF_32_BIT_MASK; + + *(u32 *)(lm_cfg + IFCVF_LM_END_ADDR_LOW) = + (log_base + log_size) & IFCVF_32_BIT_MASK; + + *(u32 *)(lm_cfg + IFCVF_LM_END_ADDR_HIGH) = + ((log_base + log_size) >> 32) & IFCVF_32_BIT_MASK; + + *(u32 *)(lm_cfg + IFCVF_LM_LOGGING_CTRL) = IFCVF_LM_ENABLE_VF; +} + +void +ifcvf_disable_logging(struct ifcvf_hw *hw) +{ + u8 *lm_cfg; + + lm_cfg = hw->lm_cfg; + *(u32 *)(lm_cfg + IFCVF_LM_LOGGING_CTRL) = IFCVF_LM_DISABLE; +} + +void +ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid) +{ + IFCVF_WRITE_REG16(qid, hw->notify_addr[qid]); +} + +u8 +ifcvf_get_notify_region(struct ifcvf_hw *hw) +{ + return hw->notify_region; +} + +u64 +ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid) +{ + return (u8 *)hw->notify_addr[qid] - + (u8 *)hw->mem_resource[hw->notify_region].addr; +} diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.h b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.h new file mode 100644 index 000000000..eb04a9406 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _IFCVF_H_ +#define _IFCVF_H_ + +#include "ifcvf_osdep.h" + +#define IFCVF_VENDOR_ID 0x1AF4 +#define IFCVF_DEVICE_ID 0x1041 +#define IFCVF_SUBSYS_VENDOR_ID 0x8086 +#define IFCVF_SUBSYS_DEVICE_ID 0x001A + +#define IFCVF_MAX_QUEUES 1 +#define VIRTIO_F_IOMMU_PLATFORM 33 + +/* Common configuration */ +#define IFCVF_PCI_CAP_COMMON_CFG 1 +/* Notifications */ +#define IFCVF_PCI_CAP_NOTIFY_CFG 2 +/* ISR Status */ +#define IFCVF_PCI_CAP_ISR_CFG 3 +/* Device specific configuration */ +#define IFCVF_PCI_CAP_DEVICE_CFG 4 +/* PCI configuration access */ +#define IFCVF_PCI_CAP_PCI_CFG 5 + +#define IFCVF_CONFIG_STATUS_RESET 0x00 +#define IFCVF_CONFIG_STATUS_ACK 0x01 +#define IFCVF_CONFIG_STATUS_DRIVER 0x02 +#define IFCVF_CONFIG_STATUS_DRIVER_OK 0x04 +#define IFCVF_CONFIG_STATUS_FEATURES_OK 0x08 +#define IFCVF_CONFIG_STATUS_FAILED 0x80 + +#define IFCVF_MSI_NO_VECTOR 0xffff +#define 
IFCVF_PCI_MAX_RESOURCE 6 + +#define IFCVF_LM_CFG_SIZE 0x40 +#define IFCVF_LM_RING_STATE_OFFSET 0x20 + +#define IFCVF_LM_LOGGING_CTRL 0x0 + +#define IFCVF_LM_BASE_ADDR_LOW 0x10 +#define IFCVF_LM_BASE_ADDR_HIGH 0x14 +#define IFCVF_LM_END_ADDR_LOW 0x18 +#define IFCVF_LM_END_ADDR_HIGH 0x1c + +#define IFCVF_LM_DISABLE 0x0 +#define IFCVF_LM_ENABLE_VF 0x1 +#define IFCVF_LM_ENABLE_PF 0x3 +#define IFCVF_LOG_BASE 0x100000000000 +#define IFCVF_MEDIATED_VRING 0x200000000000 + +#define IFCVF_32_BIT_MASK 0xffffffff + + +struct ifcvf_pci_cap { + u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */ + u8 cap_next; /* Generic PCI field: next ptr. */ + u8 cap_len; /* Generic PCI field: capability length */ + u8 cfg_type; /* Identifies the structure. */ + u8 bar; /* Where to find it. */ + u8 padding[3]; /* Pad to full dword. */ + u32 offset; /* Offset within bar. */ + u32 length; /* Length of the structure, in bytes. */ +}; + +struct ifcvf_pci_notify_cap { + struct ifcvf_pci_cap cap; + u32 notify_off_multiplier; /* Multiplier for queue_notify_off. */ +}; + +struct ifcvf_pci_common_cfg { + /* About the whole device. */ + u32 device_feature_select; + u32 device_feature; + u32 guest_feature_select; + u32 guest_feature; + u16 msix_config; + u16 num_queues; + u8 device_status; + u8 config_generation; + + /* About a specific virtqueue. */ + u16 queue_select; + u16 queue_size; + u16 queue_msix_vector; + u16 queue_enable; + u16 queue_notify_off; + u32 queue_desc_lo; + u32 queue_desc_hi; + u32 queue_avail_lo; + u32 queue_avail_hi; + u32 queue_used_lo; + u32 queue_used_hi; +}; + +struct ifcvf_net_config { + u8 mac[6]; + u16 status; + u16 max_virtqueue_pairs; +} __rte_packed; + +struct ifcvf_pci_mem_resource { + u64 phys_addr; /**< Physical address, 0 if not resource. */ + u64 len; /**< Length of the resource. */ + u8 *addr; /**< Virtual address, NULL when not mapped. */ +}; + +struct vring_info { + u64 desc; + u64 avail; + u64 used; + u16 size; + u16 last_avail_idx; + u16 last_used_idx; +}; + +struct ifcvf_hw { + u64 req_features; + u8 notify_region; + u32 notify_off_multiplier; + struct ifcvf_pci_common_cfg *common_cfg; + struct ifcvf_net_config *dev_cfg; + u8 *isr; + u16 *notify_base; + u16 *notify_addr[IFCVF_MAX_QUEUES * 2]; + u8 *lm_cfg; + struct vring_info vring[IFCVF_MAX_QUEUES * 2]; + u8 nr_vring; + struct ifcvf_pci_mem_resource mem_resource[IFCVF_PCI_MAX_RESOURCE]; +}; + +int +ifcvf_init_hw(struct ifcvf_hw *hw, PCI_DEV *dev); + +u64 +ifcvf_get_features(struct ifcvf_hw *hw); + +int +ifcvf_start_hw(struct ifcvf_hw *hw); + +void +ifcvf_stop_hw(struct ifcvf_hw *hw); + +void +ifcvf_enable_logging(struct ifcvf_hw *hw, u64 log_base, u64 log_size); + +void +ifcvf_disable_logging(struct ifcvf_hw *hw); + +void +ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid); + +u8 +ifcvf_get_notify_region(struct ifcvf_hw *hw); + +u64 +ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid); + +#endif /* _IFCVF_H_ */ diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf_osdep.h b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf_osdep.h new file mode 100644 index 000000000..6aef25ea4 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf_osdep.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _IFCVF_OSDEP_H_ +#define _IFCVF_OSDEP_H_ + +#include <stdint.h> +#include <linux/pci_regs.h> + +#include <rte_cycles.h> +#include <rte_pci.h> +#include <rte_bus_pci.h> +#include <rte_log.h> +#include <rte_io.h> + +#define DEBUGOUT(S, args...) 
RTE_LOG(DEBUG, PMD, S, ##args) +#define STATIC static + +#define msec_delay(x) rte_delay_us_sleep(1000 * (x)) + +#define IFCVF_READ_REG8(reg) rte_read8(reg) +#define IFCVF_WRITE_REG8(val, reg) rte_write8((val), (reg)) +#define IFCVF_READ_REG16(reg) rte_read16(reg) +#define IFCVF_WRITE_REG16(val, reg) rte_write16((val), (reg)) +#define IFCVF_READ_REG32(reg) rte_read32(reg) +#define IFCVF_WRITE_REG32(val, reg) rte_write32((val), (reg)) + +typedef struct rte_pci_device PCI_DEV; + +#define PCI_READ_CONFIG_BYTE(dev, val, where) \ + rte_pci_read_config(dev, val, 1, where) + +#define PCI_READ_CONFIG_DWORD(dev, val, where) \ + rte_pci_read_config(dev, val, 4, where) + +typedef uint8_t u8; +typedef int8_t s8; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint32_t u32; +typedef int32_t s32; +typedef int64_t s64; +typedef uint64_t u64; + +static inline int +PCI_READ_CONFIG_RANGE(PCI_DEV *dev, uint32_t *val, int size, int where) +{ + return rte_pci_read_config(dev, val, size, where); +} + +#endif /* _IFCVF_OSDEP_H_ */ diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c b/src/spdk/dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c new file mode 100644 index 000000000..ec97178dc --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c @@ -0,0 +1,1280 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include <unistd.h> +#include <pthread.h> +#include <fcntl.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/epoll.h> +#include <linux/virtio_net.h> +#include <stdbool.h> + +#include <rte_malloc.h> +#include <rte_memory.h> +#include <rte_bus_pci.h> +#include <rte_vhost.h> +#include <rte_vdpa.h> +#include <rte_vfio.h> +#include <rte_spinlock.h> +#include <rte_log.h> +#include <rte_kvargs.h> +#include <rte_devargs.h> + +#include "base/ifcvf.h" + +#define DRV_LOG(level, fmt, args...) 
\ + rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \ + "IFCVF %s(): " fmt "\n", __func__, ##args) + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define IFCVF_USED_RING_LEN(size) \ + ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3) + +#define IFCVF_VDPA_MODE "vdpa" +#define IFCVF_SW_FALLBACK_LM "sw-live-migration" + +static const char * const ifcvf_valid_arguments[] = { + IFCVF_VDPA_MODE, + IFCVF_SW_FALLBACK_LM, + NULL +}; + +static int ifcvf_vdpa_logtype; + +struct ifcvf_internal { + struct rte_vdpa_dev_addr dev_addr; + struct rte_pci_device *pdev; + struct ifcvf_hw hw; + int vfio_container_fd; + int vfio_group_fd; + int vfio_dev_fd; + pthread_t tid; /* thread for notify relay */ + int epfd; + int vid; + int did; + uint16_t max_queues; + uint64_t features; + rte_atomic32_t started; + rte_atomic32_t dev_attached; + rte_atomic32_t running; + rte_spinlock_t lock; + bool sw_lm; + bool sw_fallback_running; + /* mediated vring for sw fallback */ + struct vring m_vring[IFCVF_MAX_QUEUES * 2]; + /* eventfd for used ring interrupt */ + int intr_fd[IFCVF_MAX_QUEUES * 2]; +}; + +struct internal_list { + TAILQ_ENTRY(internal_list) next; + struct ifcvf_internal *internal; +}; + +TAILQ_HEAD(internal_list_head, internal_list); +static struct internal_list_head internal_list = + TAILQ_HEAD_INITIALIZER(internal_list); + +static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER; + +static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid); + +static struct internal_list * +find_internal_resource_by_did(int did) +{ + int found = 0; + struct internal_list *list; + + pthread_mutex_lock(&internal_list_lock); + + TAILQ_FOREACH(list, &internal_list, next) { + if (did == list->internal->did) { + found = 1; + break; + } + } + + pthread_mutex_unlock(&internal_list_lock); + + if (!found) + return NULL; + + return list; +} + +static struct internal_list * +find_internal_resource_by_dev(struct rte_pci_device *pdev) +{ + int found = 0; + struct internal_list *list; + + pthread_mutex_lock(&internal_list_lock); + + TAILQ_FOREACH(list, &internal_list, next) { + if (pdev == list->internal->pdev) { + found = 1; + break; + } + } + + pthread_mutex_unlock(&internal_list_lock); + + if (!found) + return NULL; + + return list; +} + +static int +ifcvf_vfio_setup(struct ifcvf_internal *internal) +{ + struct rte_pci_device *dev = internal->pdev; + char devname[RTE_DEV_NAME_MAX_LEN] = {0}; + int iommu_group_num; + int i, ret; + + internal->vfio_dev_fd = -1; + internal->vfio_group_fd = -1; + internal->vfio_container_fd = -1; + + rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN); + ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname, + &iommu_group_num); + if (ret <= 0) { + DRV_LOG(ERR, "%s failed to get IOMMU group", devname); + return -1; + } + + internal->vfio_container_fd = rte_vfio_container_create(); + if (internal->vfio_container_fd < 0) + return -1; + + internal->vfio_group_fd = rte_vfio_container_group_bind( + internal->vfio_container_fd, iommu_group_num); + if (internal->vfio_group_fd < 0) + goto err; + + if (rte_pci_map_device(dev)) + goto err; + + internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd; + + for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE); + i++) { + internal->hw.mem_resource[i].addr = + internal->pdev->mem_resource[i].addr; + internal->hw.mem_resource[i].phys_addr = + internal->pdev->mem_resource[i].phys_addr; + internal->hw.mem_resource[i].len = + internal->pdev->mem_resource[i].len; + } + + return 0; + +err: + 
rte_vfio_container_destroy(internal->vfio_container_fd); + return -1; +} + +static int +ifcvf_dma_map(struct ifcvf_internal *internal, int do_map) +{ + uint32_t i; + int ret; + struct rte_vhost_memory *mem = NULL; + int vfio_container_fd; + + ret = rte_vhost_get_mem_table(internal->vid, &mem); + if (ret < 0) { + DRV_LOG(ERR, "failed to get VM memory layout."); + goto exit; + } + + vfio_container_fd = internal->vfio_container_fd; + + for (i = 0; i < mem->nregions; i++) { + struct rte_vhost_mem_region *reg; + + reg = &mem->regions[i]; + DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", " + "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".", + do_map ? "DMA map" : "DMA unmap", i, + reg->host_user_addr, reg->guest_phys_addr, reg->size); + + if (do_map) { + ret = rte_vfio_container_dma_map(vfio_container_fd, + reg->host_user_addr, reg->guest_phys_addr, + reg->size); + if (ret < 0) { + DRV_LOG(ERR, "DMA map failed."); + goto exit; + } + } else { + ret = rte_vfio_container_dma_unmap(vfio_container_fd, + reg->host_user_addr, reg->guest_phys_addr, + reg->size); + if (ret < 0) { + DRV_LOG(ERR, "DMA unmap failed."); + goto exit; + } + } + } + +exit: + if (mem) + free(mem); + return ret; +} + +static uint64_t +hva_to_gpa(int vid, uint64_t hva) +{ + struct rte_vhost_memory *mem = NULL; + struct rte_vhost_mem_region *reg; + uint32_t i; + uint64_t gpa = 0; + + if (rte_vhost_get_mem_table(vid, &mem) < 0) + goto exit; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + + if (hva >= reg->host_user_addr && + hva < reg->host_user_addr + reg->size) { + gpa = hva - reg->host_user_addr + reg->guest_phys_addr; + break; + } + } + +exit: + if (mem) + free(mem); + return gpa; +} + +static int +vdpa_ifcvf_start(struct ifcvf_internal *internal) +{ + struct ifcvf_hw *hw = &internal->hw; + int i, nr_vring; + int vid; + struct rte_vhost_vring vq; + uint64_t gpa; + + vid = internal->vid; + nr_vring = rte_vhost_get_vring_num(vid); + rte_vhost_get_negotiated_features(vid, &hw->req_features); + + for (i = 0; i < nr_vring; i++) { + rte_vhost_get_vhost_vring(vid, i, &vq); + gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc); + if (gpa == 0) { + DRV_LOG(ERR, "Fail to get GPA for descriptor ring."); + return -1; + } + hw->vring[i].desc = gpa; + + gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail); + if (gpa == 0) { + DRV_LOG(ERR, "Fail to get GPA for available ring."); + return -1; + } + hw->vring[i].avail = gpa; + + gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used); + if (gpa == 0) { + DRV_LOG(ERR, "Fail to get GPA for used ring."); + return -1; + } + hw->vring[i].used = gpa; + + hw->vring[i].size = vq.size; + rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx, + &hw->vring[i].last_used_idx); + } + hw->nr_vring = i; + + return ifcvf_start_hw(&internal->hw); +} + +static void +vdpa_ifcvf_stop(struct ifcvf_internal *internal) +{ + struct ifcvf_hw *hw = &internal->hw; + uint32_t i; + int vid; + uint64_t features = 0; + uint64_t log_base = 0, log_size = 0; + uint64_t len; + + vid = internal->vid; + ifcvf_stop_hw(hw); + + for (i = 0; i < hw->nr_vring; i++) + rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx, + hw->vring[i].last_used_idx); + + if (internal->sw_lm) + return; + + rte_vhost_get_negotiated_features(vid, &features); + if (RTE_VHOST_NEED_LOG(features)) { + ifcvf_disable_logging(hw); + rte_vhost_get_log_base(internal->vid, &log_base, &log_size); + rte_vfio_container_dma_unmap(internal->vfio_container_fd, + log_base, IFCVF_LOG_BASE, log_size); + /* + * IFCVF marks dirty memory pages for only 
packet buffer, + * SW helps to mark the used ring as dirty after device stops. + */ + for (i = 0; i < hw->nr_vring; i++) { + len = IFCVF_USED_RING_LEN(hw->vring[i].size); + rte_vhost_log_used_vring(vid, i, 0, len); + } + } +} + +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1)) +static int +vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx) +{ + int ret; + uint32_t i, nr_vring; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + struct rte_vhost_vring vring; + int fd; + + vring.callfd = -1; + + nr_vring = rte_vhost_get_vring_num(internal->vid); + + irq_set = (struct vfio_irq_set *)irq_set_buf; + irq_set->argsz = sizeof(irq_set_buf); + irq_set->count = nr_vring + 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *)&irq_set->data; + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd; + + for (i = 0; i < nr_vring; i++) + internal->intr_fd[i] = -1; + + for (i = 0; i < nr_vring; i++) { + rte_vhost_get_vhost_vring(internal->vid, i, &vring); + fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd; + if ((i & 1) == 0 && m_rx == true) { + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + DRV_LOG(ERR, "can't setup eventfd: %s", + strerror(errno)); + return -1; + } + internal->intr_fd[i] = fd; + fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd; + } + } + + ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + if (ret) { + DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static int +vdpa_disable_vfio_intr(struct ifcvf_internal *internal) +{ + int ret; + uint32_t i, nr_vring; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + + irq_set = (struct vfio_irq_set *)irq_set_buf; + irq_set->argsz = sizeof(irq_set_buf); + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + nr_vring = rte_vhost_get_vring_num(internal->vid); + for (i = 0; i < nr_vring; i++) { + if (internal->intr_fd[i] >= 0) + close(internal->intr_fd[i]); + internal->intr_fd[i] = -1; + } + + ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + if (ret) { + DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static void * +notify_relay(void *arg) +{ + int i, kickfd, epfd, nfds = 0; + uint32_t qid, q_num; + struct epoll_event events[IFCVF_MAX_QUEUES * 2]; + struct epoll_event ev; + uint64_t buf; + int nbytes; + struct rte_vhost_vring vring; + struct ifcvf_internal *internal = (struct ifcvf_internal *)arg; + struct ifcvf_hw *hw = &internal->hw; + + q_num = rte_vhost_get_vring_num(internal->vid); + + epfd = epoll_create(IFCVF_MAX_QUEUES * 2); + if (epfd < 0) { + DRV_LOG(ERR, "failed to create epoll instance."); + return NULL; + } + internal->epfd = epfd; + + vring.kickfd = -1; + for (qid = 0; qid < q_num; qid++) { + ev.events = EPOLLIN | EPOLLPRI; + rte_vhost_get_vhost_vring(internal->vid, qid, &vring); + ev.data.u64 = qid | (uint64_t)vring.kickfd << 32; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) { + DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); + return NULL; + } + } + + for (;;) { + nfds = epoll_wait(epfd, events, q_num, -1); + if (nfds < 0) { + if (errno == EINTR) + continue; + DRV_LOG(ERR, "epoll_wait 
return fail\n"); + return NULL; + } + + for (i = 0; i < nfds; i++) { + qid = events[i].data.u32; + kickfd = (uint32_t)(events[i].data.u64 >> 32); + do { + nbytes = read(kickfd, &buf, 8); + if (nbytes < 0) { + if (errno == EINTR || + errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + DRV_LOG(INFO, "Error reading " + "kickfd: %s", + strerror(errno)); + } + break; + } while (1); + + ifcvf_notify_queue(hw, qid); + } + } + + return NULL; +} + +static int +setup_notify_relay(struct ifcvf_internal *internal) +{ + int ret; + + ret = pthread_create(&internal->tid, NULL, notify_relay, + (void *)internal); + if (ret) { + DRV_LOG(ERR, "failed to create notify relay pthread."); + return -1; + } + return 0; +} + +static int +unset_notify_relay(struct ifcvf_internal *internal) +{ + void *status; + + if (internal->tid) { + pthread_cancel(internal->tid); + pthread_join(internal->tid, &status); + } + internal->tid = 0; + + if (internal->epfd >= 0) + close(internal->epfd); + internal->epfd = -1; + + return 0; +} + +static int +update_datapath(struct ifcvf_internal *internal) +{ + int ret; + + rte_spinlock_lock(&internal->lock); + + if (!rte_atomic32_read(&internal->running) && + (rte_atomic32_read(&internal->started) && + rte_atomic32_read(&internal->dev_attached))) { + ret = ifcvf_dma_map(internal, 1); + if (ret) + goto err; + + ret = vdpa_enable_vfio_intr(internal, 0); + if (ret) + goto err; + + ret = vdpa_ifcvf_start(internal); + if (ret) + goto err; + + ret = setup_notify_relay(internal); + if (ret) + goto err; + + rte_atomic32_set(&internal->running, 1); + } else if (rte_atomic32_read(&internal->running) && + (!rte_atomic32_read(&internal->started) || + !rte_atomic32_read(&internal->dev_attached))) { + ret = unset_notify_relay(internal); + if (ret) + goto err; + + vdpa_ifcvf_stop(internal); + + ret = vdpa_disable_vfio_intr(internal); + if (ret) + goto err; + + ret = ifcvf_dma_map(internal, 0); + if (ret) + goto err; + + rte_atomic32_set(&internal->running, 0); + } + + rte_spinlock_unlock(&internal->lock); + return 0; +err: + rte_spinlock_unlock(&internal->lock); + return ret; +} + +static int +m_ifcvf_start(struct ifcvf_internal *internal) +{ + struct ifcvf_hw *hw = &internal->hw; + uint32_t i, nr_vring; + int vid, ret; + struct rte_vhost_vring vq; + void *vring_buf; + uint64_t m_vring_iova = IFCVF_MEDIATED_VRING; + uint64_t size; + uint64_t gpa; + + memset(&vq, 0, sizeof(vq)); + vid = internal->vid; + nr_vring = rte_vhost_get_vring_num(vid); + rte_vhost_get_negotiated_features(vid, &hw->req_features); + + for (i = 0; i < nr_vring; i++) { + rte_vhost_get_vhost_vring(vid, i, &vq); + + size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE), + PAGE_SIZE); + vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE); + vring_init(&internal->m_vring[i], vq.size, vring_buf, + PAGE_SIZE); + + ret = rte_vfio_container_dma_map(internal->vfio_container_fd, + (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size); + if (ret < 0) { + DRV_LOG(ERR, "mediated vring DMA map failed."); + goto error; + } + + gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc); + if (gpa == 0) { + DRV_LOG(ERR, "Fail to get GPA for descriptor ring."); + return -1; + } + hw->vring[i].desc = gpa; + + gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail); + if (gpa == 0) { + DRV_LOG(ERR, "Fail to get GPA for available ring."); + return -1; + } + hw->vring[i].avail = gpa; + + /* Direct I/O for Tx queue, relay for Rx queue */ + if (i & 1) { + gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used); + if (gpa == 0) { + DRV_LOG(ERR, "Fail to get GPA for used 
ring."); + return -1; + } + hw->vring[i].used = gpa; + } else { + hw->vring[i].used = m_vring_iova + + (char *)internal->m_vring[i].used - + (char *)internal->m_vring[i].desc; + } + + hw->vring[i].size = vq.size; + + rte_vhost_get_vring_base(vid, i, + &internal->m_vring[i].avail->idx, + &internal->m_vring[i].used->idx); + + rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx, + &hw->vring[i].last_used_idx); + + m_vring_iova += size; + } + hw->nr_vring = nr_vring; + + return ifcvf_start_hw(&internal->hw); + +error: + for (i = 0; i < nr_vring; i++) + if (internal->m_vring[i].desc) + rte_free(internal->m_vring[i].desc); + + return -1; +} + +static int +m_ifcvf_stop(struct ifcvf_internal *internal) +{ + int vid; + uint32_t i; + struct rte_vhost_vring vq; + struct ifcvf_hw *hw = &internal->hw; + uint64_t m_vring_iova = IFCVF_MEDIATED_VRING; + uint64_t size, len; + + vid = internal->vid; + ifcvf_stop_hw(hw); + + for (i = 0; i < hw->nr_vring; i++) { + /* synchronize remaining new used entries if any */ + if ((i & 1) == 0) + update_used_ring(internal, i); + + rte_vhost_get_vhost_vring(vid, i, &vq); + len = IFCVF_USED_RING_LEN(vq.size); + rte_vhost_log_used_vring(vid, i, 0, len); + + size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE), + PAGE_SIZE); + rte_vfio_container_dma_unmap(internal->vfio_container_fd, + (uint64_t)(uintptr_t)internal->m_vring[i].desc, + m_vring_iova, size); + + rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx, + hw->vring[i].last_used_idx); + rte_free(internal->m_vring[i].desc); + m_vring_iova += size; + } + + return 0; +} + +static void +update_used_ring(struct ifcvf_internal *internal, uint16_t qid) +{ + rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]); + rte_vhost_vring_call(internal->vid, qid); +} + +static void * +vring_relay(void *arg) +{ + int i, vid, epfd, fd, nfds; + struct ifcvf_internal *internal = (struct ifcvf_internal *)arg; + struct rte_vhost_vring vring; + uint16_t qid, q_num; + struct epoll_event events[IFCVF_MAX_QUEUES * 4]; + struct epoll_event ev; + int nbytes; + uint64_t buf; + + vid = internal->vid; + q_num = rte_vhost_get_vring_num(vid); + + /* add notify fd and interrupt fd to epoll */ + epfd = epoll_create(IFCVF_MAX_QUEUES * 2); + if (epfd < 0) { + DRV_LOG(ERR, "failed to create epoll instance."); + return NULL; + } + internal->epfd = epfd; + + vring.kickfd = -1; + for (qid = 0; qid < q_num; qid++) { + ev.events = EPOLLIN | EPOLLPRI; + rte_vhost_get_vhost_vring(vid, qid, &vring); + ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) { + DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); + return NULL; + } + } + + for (qid = 0; qid < q_num; qid += 2) { + ev.events = EPOLLIN | EPOLLPRI; + /* leave a flag to mark it's for interrupt */ + ev.data.u64 = 1 | qid << 1 | + (uint64_t)internal->intr_fd[qid] << 32; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev) + < 0) { + DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); + return NULL; + } + update_used_ring(internal, qid); + } + + /* start relay with a first kick */ + for (qid = 0; qid < q_num; qid++) + ifcvf_notify_queue(&internal->hw, qid); + + /* listen to the events and react accordingly */ + for (;;) { + nfds = epoll_wait(epfd, events, q_num * 2, -1); + if (nfds < 0) { + if (errno == EINTR) + continue; + DRV_LOG(ERR, "epoll_wait return fail\n"); + return NULL; + } + + for (i = 0; i < nfds; i++) { + fd = (uint32_t)(events[i].data.u64 >> 32); + do { + nbytes = read(fd, &buf, 
8); + if (nbytes < 0) { + if (errno == EINTR || + errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + DRV_LOG(INFO, "Error reading " + "kickfd: %s", + strerror(errno)); + } + break; + } while (1); + + qid = events[i].data.u32 >> 1; + + if (events[i].data.u32 & 1) + update_used_ring(internal, qid); + else + ifcvf_notify_queue(&internal->hw, qid); + } + } + + return NULL; +} + +static int +setup_vring_relay(struct ifcvf_internal *internal) +{ + int ret; + + ret = pthread_create(&internal->tid, NULL, vring_relay, + (void *)internal); + if (ret) { + DRV_LOG(ERR, "failed to create ring relay pthread."); + return -1; + } + return 0; +} + +static int +unset_vring_relay(struct ifcvf_internal *internal) +{ + void *status; + + if (internal->tid) { + pthread_cancel(internal->tid); + pthread_join(internal->tid, &status); + } + internal->tid = 0; + + if (internal->epfd >= 0) + close(internal->epfd); + internal->epfd = -1; + + return 0; +} + +static int +ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal) +{ + int ret; + int vid = internal->vid; + + /* stop the direct IO data path */ + unset_notify_relay(internal); + vdpa_ifcvf_stop(internal); + vdpa_disable_vfio_intr(internal); + + ret = rte_vhost_host_notifier_ctrl(vid, false); + if (ret && ret != -ENOTSUP) + goto error; + + /* set up interrupt for interrupt relay */ + ret = vdpa_enable_vfio_intr(internal, 1); + if (ret) + goto unmap; + + /* config the VF */ + ret = m_ifcvf_start(internal); + if (ret) + goto unset_intr; + + /* set up vring relay thread */ + ret = setup_vring_relay(internal); + if (ret) + goto stop_vf; + + rte_vhost_host_notifier_ctrl(vid, true); + + internal->sw_fallback_running = true; + + return 0; + +stop_vf: + m_ifcvf_stop(internal); +unset_intr: + vdpa_disable_vfio_intr(internal); +unmap: + ifcvf_dma_map(internal, 0); +error: + return -1; +} + +static int +ifcvf_dev_config(int vid) +{ + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + + did = rte_vhost_get_vdpa_device_id(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = list->internal; + internal->vid = vid; + rte_atomic32_set(&internal->dev_attached, 1); + update_datapath(internal); + + if (rte_vhost_host_notifier_ctrl(vid, true) != 0) + DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did); + + return 0; +} + +static int +ifcvf_dev_close(int vid) +{ + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + + did = rte_vhost_get_vdpa_device_id(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = list->internal; + + if (internal->sw_fallback_running) { + /* unset ring relay */ + unset_vring_relay(internal); + + /* reset VF */ + m_ifcvf_stop(internal); + + /* remove interrupt setting */ + vdpa_disable_vfio_intr(internal); + + /* unset DMA map for guest memory */ + ifcvf_dma_map(internal, 0); + + internal->sw_fallback_running = false; + } else { + rte_atomic32_set(&internal->dev_attached, 0); + update_datapath(internal); + } + + return 0; +} + +static int +ifcvf_set_features(int vid) +{ + uint64_t features = 0; + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + uint64_t log_base = 0, log_size = 0; + + did = rte_vhost_get_vdpa_device_id(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = 
list->internal; + rte_vhost_get_negotiated_features(vid, &features); + + if (!RTE_VHOST_NEED_LOG(features)) + return 0; + + if (internal->sw_lm) { + ifcvf_sw_fallback_switchover(internal); + } else { + rte_vhost_get_log_base(vid, &log_base, &log_size); + rte_vfio_container_dma_map(internal->vfio_container_fd, + log_base, IFCVF_LOG_BASE, log_size); + ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size); + } + + return 0; +} + +static int +ifcvf_get_vfio_group_fd(int vid) +{ + int did; + struct internal_list *list; + + did = rte_vhost_get_vdpa_device_id(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + return list->internal->vfio_group_fd; +} + +static int +ifcvf_get_vfio_device_fd(int vid) +{ + int did; + struct internal_list *list; + + did = rte_vhost_get_vdpa_device_id(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + return list->internal->vfio_dev_fd; +} + +static int +ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size) +{ + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + struct vfio_region_info reg = { .argsz = sizeof(reg) }; + int ret; + + did = rte_vhost_get_vdpa_device_id(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = list->internal; + + reg.index = ifcvf_get_notify_region(&internal->hw); + ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); + if (ret) { + DRV_LOG(ERR, "Get not get device region info: %s", + strerror(errno)); + return -1; + } + + *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset; + *size = 0x1000; + + return 0; +} + +static int +ifcvf_get_queue_num(int did, uint32_t *queue_num) +{ + struct internal_list *list; + + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + *queue_num = list->internal->max_queues; + + return 0; +} + +static int +ifcvf_get_vdpa_features(int did, uint64_t *features) +{ + struct internal_list *list; + + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + *features = list->internal->features; + + return 0; +} + +#define VDPA_SUPPORTED_PROTOCOL_FEATURES \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \ + 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \ + 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \ + 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \ + 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) +static int +ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features) +{ + *features = VDPA_SUPPORTED_PROTOCOL_FEATURES; + return 0; +} + +static struct rte_vdpa_dev_ops ifcvf_ops = { + .get_queue_num = ifcvf_get_queue_num, + .get_features = ifcvf_get_vdpa_features, + .get_protocol_features = ifcvf_get_protocol_features, + .dev_conf = ifcvf_dev_config, + .dev_close = ifcvf_dev_close, + .set_vring_state = NULL, + .set_features = ifcvf_set_features, + .migration_done = NULL, + .get_vfio_group_fd = ifcvf_get_vfio_group_fd, + .get_vfio_device_fd = ifcvf_get_vfio_device_fd, + .get_notify_area = ifcvf_get_notify_area, +}; + +static inline int +open_int(const char *key __rte_unused, const char *value, void *extra_args) +{ + uint16_t *n = extra_args; + + if (value == NULL || extra_args == NULL) + return -EINVAL; + + *n = 
(uint16_t)strtoul(value, NULL, 0); + if (*n == USHRT_MAX && errno == ERANGE) + return -1; + + return 0; +} + +static int +ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) +{ + uint64_t features; + struct ifcvf_internal *internal = NULL; + struct internal_list *list = NULL; + int vdpa_mode = 0; + int sw_fallback_lm = 0; + struct rte_kvargs *kvlist = NULL; + int ret = 0; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + if (!pci_dev->device.devargs) + return 1; + + kvlist = rte_kvargs_parse(pci_dev->device.devargs->args, + ifcvf_valid_arguments); + if (kvlist == NULL) + return 1; + + /* probe only when vdpa mode is specified */ + if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) { + rte_kvargs_free(kvlist); + return 1; + } + + ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int, + &vdpa_mode); + if (ret < 0 || vdpa_mode == 0) { + rte_kvargs_free(kvlist); + return 1; + } + + list = rte_zmalloc("ifcvf", sizeof(*list), 0); + if (list == NULL) + goto error; + + internal = rte_zmalloc("ifcvf", sizeof(*internal), 0); + if (internal == NULL) + goto error; + + internal->pdev = pci_dev; + rte_spinlock_init(&internal->lock); + + if (ifcvf_vfio_setup(internal) < 0) { + DRV_LOG(ERR, "failed to setup device %s", pci_dev->name); + goto error; + } + + if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) { + DRV_LOG(ERR, "failed to init device %s", pci_dev->name); + goto error; + } + + internal->max_queues = IFCVF_MAX_QUEUES; + features = ifcvf_get_features(&internal->hw); + internal->features = (features & + ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) | + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | + (1ULL << VIRTIO_NET_F_CTRL_VQ) | + (1ULL << VIRTIO_NET_F_STATUS) | + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | + (1ULL << VHOST_F_LOG_ALL); + + internal->dev_addr.pci_addr = pci_dev->addr; + internal->dev_addr.type = VDPA_ADDR_PCI; + list->internal = internal; + + if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) { + ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM, + &open_int, &sw_fallback_lm); + if (ret < 0) + goto error; + } + internal->sw_lm = sw_fallback_lm; + + internal->did = rte_vdpa_register_device(&internal->dev_addr, + &ifcvf_ops); + if (internal->did < 0) { + DRV_LOG(ERR, "failed to register device %s", pci_dev->name); + goto error; + } + + pthread_mutex_lock(&internal_list_lock); + TAILQ_INSERT_TAIL(&internal_list, list, next); + pthread_mutex_unlock(&internal_list_lock); + + rte_atomic32_set(&internal->started, 1); + update_datapath(internal); + + rte_kvargs_free(kvlist); + return 0; + +error: + rte_kvargs_free(kvlist); + rte_free(list); + rte_free(internal); + return -1; +} + +static int +ifcvf_pci_remove(struct rte_pci_device *pci_dev) +{ + struct ifcvf_internal *internal; + struct internal_list *list; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + list = find_internal_resource_by_dev(pci_dev); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device: %s", pci_dev->name); + return -1; + } + + internal = list->internal; + rte_atomic32_set(&internal->started, 0); + update_datapath(internal); + + rte_pci_unmap_device(internal->pdev); + rte_vfio_container_destroy(internal->vfio_container_fd); + rte_vdpa_unregister_device(internal->did); + + pthread_mutex_lock(&internal_list_lock); + TAILQ_REMOVE(&internal_list, list, next); + pthread_mutex_unlock(&internal_list_lock); + + rte_free(list); + rte_free(internal); + + return 0; +} + +/* + * IFCVF has the same vendor ID and device ID as virtio net PCI + * device, 
with its specific subsystem vendor ID and device ID. + */ +static const struct rte_pci_id pci_id_ifcvf_map[] = { + { .class_id = RTE_CLASS_ANY_ID, + .vendor_id = IFCVF_VENDOR_ID, + .device_id = IFCVF_DEVICE_ID, + .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID, + .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID, + }, + + { .vendor_id = 0, /* sentinel */ + }, +}; + +static struct rte_pci_driver rte_ifcvf_vdpa = { + .id_table = pci_id_ifcvf_map, + .drv_flags = 0, + .probe = ifcvf_pci_probe, + .remove = ifcvf_pci_remove, +}; + +RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa); +RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map); +RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci"); + +RTE_INIT(ifcvf_vdpa_init_log) +{ + ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa"); + if (ifcvf_vdpa_logtype >= 0) + rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE); +} diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/meson.build b/src/spdk/dpdk/drivers/vdpa/ifc/meson.build new file mode 100644 index 000000000..b179987f9 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/ifc/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +build = dpdk_conf.has('RTE_LIBRTE_VHOST') +reason = 'missing dependency, DPDK vhost library' +sources = files('ifcvf_vdpa.c', 'base/ifcvf.c') +includes += include_directories('base') +deps += 'vhost' diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/rte_pmd_ifc_version.map b/src/spdk/dpdk/drivers/vdpa/ifc/rte_pmd_ifc_version.map new file mode 100644 index 000000000..f9f17e4f6 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/ifc/rte_pmd_ifc_version.map @@ -0,0 +1,3 @@ +DPDK_20.0 { + local: *; +}; diff --git a/src/spdk/dpdk/drivers/vdpa/meson.build b/src/spdk/dpdk/drivers/vdpa/meson.build new file mode 100644 index 000000000..e3ed54a25 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2019 Mellanox Technologies, Ltd + +drivers = ['ifc', + 'mlx5',] +std_deps = ['bus_pci', 'kvargs'] +std_deps += ['vhost'] +config_flag_fmt = 'RTE_LIBRTE_@0@_PMD' +driver_name_fmt = 'rte_pmd_@0@' diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/Makefile b/src/spdk/dpdk/drivers/vdpa/mlx5/Makefile new file mode 100644 index 000000000..ef34c0b88 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/Makefile @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2019 Mellanox Technologies, Ltd + +include $(RTE_SDK)/mk/rte.vars.mk + +# Library name. +LIB = librte_pmd_mlx5_vdpa.a + +# Sources. +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_event.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_virtq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_steer.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_lm.c + + +# Basic CFLAGS. +CFLAGS += -O3 +CFLAGS += -std=c11 -Wall -Wextra +CFLAGS += -g +CFLAGS += -I$(RTE_SDK)/drivers/common/mlx5 +CFLAGS += -I$(RTE_SDK)/drivers/net/mlx5_vdpa +CFLAGS += -I$(RTE_SDK)/lib/librte_sched +CFLAGS += -I$(BUILDDIR)/drivers/common/mlx5 +CFLAGS += -D_BSD_SOURCE +CFLAGS += -D_DEFAULT_SOURCE +CFLAGS += -D_XOPEN_SOURCE=600 +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -Wno-strict-prototypes +LDLIBS += -lrte_common_mlx5 +LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_pci -lrte_bus_pci -lrte_sched + +# A few warnings cannot be avoided in external headers. 
+CFLAGS += -Wno-error=cast-qual + +EXPORT_MAP := rte_pmd_mlx5_vdpa_version.map + +# DEBUG which is usually provided on the command-line may enable +# CONFIG_RTE_LIBRTE_MLX5_DEBUG. +ifeq ($(DEBUG),1) +CONFIG_RTE_LIBRTE_MLX5_DEBUG := y +endif + +# User-defined CFLAGS. +ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DEBUG),y) +CFLAGS += -pedantic +ifneq ($(CONFIG_RTE_TOOLCHAIN_ICC),y) +CFLAGS += -DPEDANTIC +endif +AUTO_CONFIG_CFLAGS += -Wno-pedantic +else +CFLAGS += -UPEDANTIC +endif + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/meson.build b/src/spdk/dpdk/drivers/vdpa/mlx5/meson.build new file mode 100644 index 000000000..2963aad71 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/meson.build @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2019 Mellanox Technologies, Ltd + +if not is_linux + build = false + reason = 'only supported on Linux' + subdir_done() +endif + +fmt_name = 'mlx5_vdpa' +deps += ['hash', 'common_mlx5', 'vhost', 'pci', 'bus_pci', 'eal', 'sched'] +sources = files( + 'mlx5_vdpa.c', + 'mlx5_vdpa_mem.c', + 'mlx5_vdpa_event.c', + 'mlx5_vdpa_virtq.c', + 'mlx5_vdpa_steer.c', + 'mlx5_vdpa_lm.c', +) +cflags_options = [ + '-std=c11', + '-Wno-strict-prototypes', + '-D_BSD_SOURCE', + '-D_DEFAULT_SOURCE', + '-D_XOPEN_SOURCE=600' +] +foreach option:cflags_options + if cc.has_argument(option) + cflags += option + endif +endforeach + +if get_option('buildtype').contains('debug') + cflags += [ '-pedantic', '-DPEDANTIC' ] +else + cflags += [ '-UPEDANTIC' ] +endif diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.c new file mode 100644 index 000000000..1113d6cef --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.c @@ -0,0 +1,626 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ +#include <unistd.h> + +#include <rte_malloc.h> +#include <rte_log.h> +#include <rte_errno.h> +#include <rte_bus_pci.h> +#include <rte_pci.h> + +#include <mlx5_glue.h> +#include <mlx5_common.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> +#include <mlx5_nl.h> + +#include "mlx5_vdpa_utils.h" +#include "mlx5_vdpa.h" + + +#define MLX5_VDPA_DEFAULT_FEATURES ((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_F_ANY_LAYOUT) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_F_ORDER_PLATFORM) | \ + (1ULL << VHOST_F_LOG_ALL)) + +#define MLX5_VDPA_PROTOCOL_FEATURES \ + ((1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \ + (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \ + (1ULL << VHOST_USER_PROTOCOL_F_MQ)) + +TAILQ_HEAD(mlx5_vdpa_privs, mlx5_vdpa_priv) priv_list = + TAILQ_HEAD_INITIALIZER(priv_list); +static pthread_mutex_t priv_list_lock = PTHREAD_MUTEX_INITIALIZER; +int mlx5_vdpa_logtype; + +static struct mlx5_vdpa_priv * +mlx5_vdpa_find_priv_resource_by_did(int did) +{ + struct mlx5_vdpa_priv *priv; + int found = 0; + + pthread_mutex_lock(&priv_list_lock); + TAILQ_FOREACH(priv, &priv_list, next) { + if (did == priv->id) { + found = 1; + break; + } + } + pthread_mutex_unlock(&priv_list_lock); + if (!found) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + rte_errno = EINVAL; + return NULL; + } + return priv; +} + +static int +mlx5_vdpa_get_queue_num(int did, uint32_t *queue_num) +{ + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: 
%d.", did); + return -1; + } + *queue_num = priv->caps.max_num_virtio_queues; + return 0; +} + +static int +mlx5_vdpa_get_vdpa_features(int did, uint64_t *features) +{ + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -1; + } + *features = MLX5_VDPA_DEFAULT_FEATURES; + if (priv->caps.virtio_queue_type & (1 << MLX5_VIRTQ_TYPE_PACKED)) + *features |= (1ULL << VIRTIO_F_RING_PACKED); + if (priv->caps.tso_ipv4) + *features |= (1ULL << VIRTIO_NET_F_HOST_TSO4); + if (priv->caps.tso_ipv6) + *features |= (1ULL << VIRTIO_NET_F_HOST_TSO6); + if (priv->caps.tx_csum) + *features |= (1ULL << VIRTIO_NET_F_CSUM); + if (priv->caps.rx_csum) + *features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM); + if (priv->caps.virtio_version_1_0) + *features |= (1ULL << VIRTIO_F_VERSION_1); + return 0; +} + +static int +mlx5_vdpa_get_protocol_features(int did, uint64_t *features) +{ + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -1; + } + *features = MLX5_VDPA_PROTOCOL_FEATURES; + return 0; +} + +static int +mlx5_vdpa_set_vring_state(int vid, int vring, int state) +{ + int did = rte_vhost_get_vdpa_device_id(vid); + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -EINVAL; + } + if (vring >= (int)priv->caps.max_num_virtio_queues * 2) { + DRV_LOG(ERR, "Too big vring id: %d.", vring); + return -E2BIG; + } + return mlx5_vdpa_virtq_enable(priv, vring, state); +} + +static int +mlx5_vdpa_direct_db_prepare(struct mlx5_vdpa_priv *priv) +{ + int ret; + + if (priv->direct_notifier) { + ret = rte_vhost_host_notifier_ctrl(priv->vid, false); + if (ret != 0) { + DRV_LOG(INFO, "Direct HW notifier FD cannot be " + "destroyed for device %d: %d.", priv->vid, ret); + return -1; + } + priv->direct_notifier = 0; + } + ret = rte_vhost_host_notifier_ctrl(priv->vid, true); + if (ret != 0) + DRV_LOG(INFO, "Direct HW notifier FD cannot be configured for" + " device %d: %d.", priv->vid, ret); + else + priv->direct_notifier = 1; + return 0; +} + +static int +mlx5_vdpa_features_set(int vid) +{ + int did = rte_vhost_get_vdpa_device_id(vid); + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + uint64_t log_base, log_size; + uint64_t features; + int ret; + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -EINVAL; + } + ret = rte_vhost_get_negotiated_features(vid, &features); + if (ret) { + DRV_LOG(ERR, "Failed to get negotiated features."); + return ret; + } + if (RTE_VHOST_NEED_LOG(features)) { + ret = rte_vhost_get_log_base(vid, &log_base, &log_size); + if (ret) { + DRV_LOG(ERR, "Failed to get log base."); + return ret; + } + ret = mlx5_vdpa_dirty_bitmap_set(priv, log_base, log_size); + if (ret) { + DRV_LOG(ERR, "Failed to set dirty bitmap."); + return ret; + } + DRV_LOG(INFO, "mlx5 vdpa: enabling dirty logging..."); + ret = mlx5_vdpa_logging_enable(priv, 1); + if (ret) { + DRV_LOG(ERR, "Failed t enable dirty logging."); + return ret; + } + } + return 0; +} + +static int +mlx5_vdpa_dev_close(int vid) +{ + int did = rte_vhost_get_vdpa_device_id(vid); + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + int ret = 0; + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -1; + } + if (priv->configured) + ret |= mlx5_vdpa_lm_log(priv); + 
mlx5_vdpa_cqe_event_unset(priv); + mlx5_vdpa_steer_unset(priv); + mlx5_vdpa_virtqs_release(priv); + mlx5_vdpa_event_qp_global_release(priv); + mlx5_vdpa_mem_dereg(priv); + priv->configured = 0; + priv->vid = 0; + DRV_LOG(INFO, "vDPA device %d was closed.", vid); + return ret; +} + +static int +mlx5_vdpa_dev_config(int vid) +{ + int did = rte_vhost_get_vdpa_device_id(vid); + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -EINVAL; + } + if (priv->configured && mlx5_vdpa_dev_close(vid)) { + DRV_LOG(ERR, "Failed to reconfigure vid %d.", vid); + return -1; + } + priv->vid = vid; + if (mlx5_vdpa_mem_register(priv) || mlx5_vdpa_direct_db_prepare(priv) || + mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) || + mlx5_vdpa_cqe_event_setup(priv)) { + mlx5_vdpa_dev_close(vid); + return -1; + } + priv->configured = 1; + DRV_LOG(INFO, "vDPA device %d was configured.", vid); + return 0; +} + +static int +mlx5_vdpa_get_device_fd(int vid) +{ + int did = rte_vhost_get_vdpa_device_id(vid); + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -EINVAL; + } + return priv->ctx->cmd_fd; +} + +static int +mlx5_vdpa_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size) +{ + int did = rte_vhost_get_vdpa_device_id(vid); + struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did); + + RTE_SET_USED(qid); + if (priv == NULL) { + DRV_LOG(ERR, "Invalid device id: %d.", did); + return -EINVAL; + } + if (!priv->var) { + DRV_LOG(ERR, "VAR was not created for device %d, is the device" + " configured?.", did); + return -EINVAL; + } + *offset = priv->var->mmap_off; + *size = priv->var->length; + return 0; +} + +static struct rte_vdpa_dev_ops mlx5_vdpa_ops = { + .get_queue_num = mlx5_vdpa_get_queue_num, + .get_features = mlx5_vdpa_get_vdpa_features, + .get_protocol_features = mlx5_vdpa_get_protocol_features, + .dev_conf = mlx5_vdpa_dev_config, + .dev_close = mlx5_vdpa_dev_close, + .set_vring_state = mlx5_vdpa_set_vring_state, + .set_features = mlx5_vdpa_features_set, + .migration_done = NULL, + .get_vfio_group_fd = NULL, + .get_vfio_device_fd = mlx5_vdpa_get_device_fd, + .get_notify_area = mlx5_vdpa_get_notify_area, +}; + +static struct ibv_device * +mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr) +{ + int n; + struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n); + struct ibv_device *ibv_match = NULL; + + if (!ibv_list) { + rte_errno = ENOSYS; + return NULL; + } + while (n-- > 0) { + struct rte_pci_addr pci_addr; + + DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name); + if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr)) + continue; + if (rte_pci_addr_cmp(addr, &pci_addr)) + continue; + ibv_match = ibv_list[n]; + break; + } + if (!ibv_match) + rte_errno = ENOENT; + mlx5_glue->free_device_list(ibv_list); + return ibv_match; +} + +/* Try to disable ROCE by Netlink\Devlink. 
*/ +static int +mlx5_vdpa_nl_roce_disable(const char *addr) +{ + int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC); + int devlink_id; + int enable; + int ret; + + if (nlsk_fd < 0) + return nlsk_fd; + devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd); + if (devlink_id < 0) { + ret = devlink_id; + DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by" + " Netlink."); + goto close; + } + ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable); + if (ret) { + DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.", + ret); + goto close; + } else if (!enable) { + DRV_LOG(INFO, "ROCE has already disabled(Netlink)."); + goto close; + } + ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0); + if (ret) + DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret); + else + DRV_LOG(INFO, "ROCE is disabled by Netlink successfully."); +close: + close(nlsk_fd); + return ret; +} + +/* Try to disable ROCE by sysfs. */ +static int +mlx5_vdpa_sys_roce_disable(const char *addr) +{ + FILE *file_o; + int enable; + int ret; + + MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr); + file_o = fopen(file_p, "rb"); + if (!file_o) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + ret = fscanf(file_o, "%d", &enable); + if (ret != 1) { + rte_errno = EINVAL; + ret = EINVAL; + goto close; + } else if (!enable) { + ret = 0; + DRV_LOG(INFO, "ROCE has already disabled(sysfs)."); + goto close; + } + fclose(file_o); + file_o = fopen(file_p, "wb"); + if (!file_o) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + fprintf(file_o, "0\n"); + ret = 0; +close: + if (ret) + DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret); + else + DRV_LOG(INFO, "ROCE is disabled by sysfs successfully."); + fclose(file_o); + return ret; +} + +#define MLX5_VDPA_MAX_RETRIES 20 +#define MLX5_VDPA_USEC 1000 +static int +mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv) +{ + char addr_name[64] = {0}; + + rte_pci_device_name(addr, addr_name, sizeof(addr_name)); + /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */ + if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 || + mlx5_vdpa_sys_roce_disable(addr_name) == 0) { + /* + * Succeed to disable ROCE, wait for the IB device to appear + * again after reload. + */ + int r; + struct ibv_device *ibv_new; + + for (r = MLX5_VDPA_MAX_RETRIES; r; r--) { + ibv_new = mlx5_vdpa_get_ib_device_match(addr); + if (ibv_new) { + *ibv = ibv_new; + return 0; + } + usleep(MLX5_VDPA_USEC); + } + DRV_LOG(ERR, "Cannot much device %s after ROCE disable, " + "retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES); + rte_errno = EAGAIN; + } + return -rte_errno; +} + +/** + * DPDK callback to register a PCI device. + * + * This function spawns vdpa device out of a given PCI device. + * + * @param[in] pci_drv + * PCI driver structure (mlx5_vpda_driver). + * @param[in] pci_dev + * PCI device information. + * + * @return + * 0 on success, 1 to skip this driver, a negative errno value otherwise + * and rte_errno is set. 
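+ *
+ * Probe flow: check the devargs class, match the IB device by PCI address,
+ * disable RoCE, open the device with DevX, validate the vDPA HCA
+ * capabilities, then allocate the private structure and the VAR and
+ * register it as a vDPA device.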
+ */ +static int +mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev __rte_unused) +{ + struct ibv_device *ibv; + struct mlx5_vdpa_priv *priv = NULL; + struct ibv_context *ctx = NULL; + struct mlx5_hca_attr attr; + int ret; + + if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_VDPA) { + DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5" + " driver."); + return 1; + } + ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr); + if (!ibv) { + DRV_LOG(ERR, "No matching IB device for PCI slot " + PCI_PRI_FMT ".", pci_dev->addr.domain, + pci_dev->addr.bus, pci_dev->addr.devid, + pci_dev->addr.function); + return -rte_errno; + } else { + DRV_LOG(INFO, "PCI information matches for device \"%s\".", + ibv->name); + } + if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) { + DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".", + ibv->name); + return -rte_errno; + } + ctx = mlx5_glue->dv_open_device(ibv); + if (!ctx) { + DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name); + rte_errno = ENODEV; + return -rte_errno; + } + ret = mlx5_devx_cmd_query_hca_attr(ctx, &attr); + if (ret) { + DRV_LOG(ERR, "Unable to read HCA capabilities."); + rte_errno = ENOTSUP; + goto error; + } else if (!attr.vdpa.valid || !attr.vdpa.max_num_virtio_queues) { + DRV_LOG(ERR, "Not enough capabilities to support vdpa, maybe " + "old FW/OFED version?"); + rte_errno = ENOTSUP; + goto error; + } + priv = rte_zmalloc("mlx5 vDPA device private", sizeof(*priv) + + sizeof(struct mlx5_vdpa_virtq) * + attr.vdpa.max_num_virtio_queues * 2, + RTE_CACHE_LINE_SIZE); + if (!priv) { + DRV_LOG(ERR, "Failed to allocate private memory."); + rte_errno = ENOMEM; + goto error; + } + priv->caps = attr.vdpa; + priv->log_max_rqt_size = attr.log_max_rqt_size; + priv->ctx = ctx; + priv->dev_addr.pci_addr = pci_dev->addr; + priv->dev_addr.type = VDPA_ADDR_PCI; + priv->var = mlx5_glue->dv_alloc_var(ctx, 0); + if (!priv->var) { + DRV_LOG(ERR, "Failed to allocate VAR %u.\n", errno); + goto error; + } + priv->id = rte_vdpa_register_device(&priv->dev_addr, &mlx5_vdpa_ops); + if (priv->id < 0) { + DRV_LOG(ERR, "Failed to register vDPA device."); + rte_errno = rte_errno ? rte_errno : EINVAL; + goto error; + } + SLIST_INIT(&priv->mr_list); + pthread_mutex_lock(&priv_list_lock); + TAILQ_INSERT_TAIL(&priv_list, priv, next); + pthread_mutex_unlock(&priv_list_lock); + return 0; + +error: + if (priv) { + if (priv->var) + mlx5_glue->dv_free_var(priv->var); + rte_free(priv); + } + if (ctx) + mlx5_glue->close_device(ctx); + return -rte_errno; +} + +/** + * DPDK callback to remove a PCI device. + * + * This function removes all vDPA devices belong to a given PCI device. + * + * @param[in] pci_dev + * Pointer to the PCI device. + * + * @return + * 0 on success, the function cannot fail. 
+ */ +static int +mlx5_vdpa_pci_remove(struct rte_pci_device *pci_dev) +{ + struct mlx5_vdpa_priv *priv = NULL; + int found = 0; + + pthread_mutex_lock(&priv_list_lock); + TAILQ_FOREACH(priv, &priv_list, next) { + if (memcmp(&priv->dev_addr.pci_addr, &pci_dev->addr, + sizeof(pci_dev->addr)) == 0) { + found = 1; + break; + } + } + if (found) + TAILQ_REMOVE(&priv_list, priv, next); + pthread_mutex_unlock(&priv_list_lock); + if (found) { + if (priv->configured) + mlx5_vdpa_dev_close(priv->vid); + if (priv->var) { + mlx5_glue->dv_free_var(priv->var); + priv->var = NULL; + } + mlx5_glue->close_device(priv->ctx); + rte_free(priv); + } + return 0; +} + +static const struct rte_pci_id mlx5_vdpa_pci_id_map[] = { + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6VF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DX) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF) + }, + { + .vendor_id = 0 + } +}; + +static struct rte_pci_driver mlx5_vdpa_driver = { + .driver = { + .name = "mlx5_vdpa", + }, + .id_table = mlx5_vdpa_pci_id_map, + .probe = mlx5_vdpa_pci_probe, + .remove = mlx5_vdpa_pci_remove, + .drv_flags = 0, +}; + +/** + * Driver initialization routine. + */ +RTE_INIT(rte_mlx5_vdpa_init) +{ + /* Initialize common log type. */ + mlx5_vdpa_logtype = rte_log_register("pmd.vdpa.mlx5"); + if (mlx5_vdpa_logtype >= 0) + rte_log_set_level(mlx5_vdpa_logtype, RTE_LOG_NOTICE); + if (mlx5_glue) + rte_pci_register(&mlx5_vdpa_driver); +} + +RTE_PMD_EXPORT_NAME(net_mlx5_vdpa, __COUNTER__); +RTE_PMD_REGISTER_PCI_TABLE(net_mlx5_vdpa, mlx5_vdpa_pci_id_map); +RTE_PMD_REGISTER_KMOD_DEP(net_mlx5_vdpa, "* ib_uverbs & mlx5_core & mlx5_ib"); diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.h b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.h new file mode 100644 index 000000000..fcc216ac7 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.h @@ -0,0 +1,355 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_VDPA_H_ +#define RTE_PMD_MLX5_VDPA_H_ + +#include <linux/virtio_net.h> +#include <sys/queue.h> + +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <rte_vdpa.h> +#include <rte_vhost.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif +#include <rte_spinlock.h> +#include <rte_interrupts.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> + + +#define MLX5_VDPA_INTR_RETRIES 256 +#define MLX5_VDPA_INTR_RETRIES_USEC 1000 + +#ifndef VIRTIO_F_ORDER_PLATFORM +#define VIRTIO_F_ORDER_PLATFORM 36 +#endif + +#ifndef VIRTIO_F_RING_PACKED +#define VIRTIO_F_RING_PACKED 34 +#endif + +struct mlx5_vdpa_cq { + uint16_t log_desc_n; + uint32_t cq_ci:24; + uint32_t arm_sn:2; + int callfd; + rte_spinlock_t sl; + struct mlx5_devx_obj *cq; + struct mlx5dv_devx_umem *umem_obj; + union { + volatile void *umem_buf; + volatile struct mlx5_cqe *cqes; + }; + volatile uint32_t *db_rec; + uint64_t errors; +}; + +struct mlx5_vdpa_event_qp { + struct mlx5_vdpa_cq cq; + struct mlx5_devx_obj *fw_qp; + struct mlx5_devx_obj *sw_qp; + struct mlx5dv_devx_umem *umem_obj; + void *umem_buf; + volatile uint32_t *db_rec; +}; + +struct mlx5_vdpa_query_mr { + SLIST_ENTRY(mlx5_vdpa_query_mr) next; + void *addr; + uint64_t length; + 
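+	/*
+	 * addr/length describe the registered host VA range; umem and mkey are
+	 * the DevX objects created for it. is_indirect marks the single KLM
+	 * (indirect) mkey built on top of the direct entries to cover the whole
+	 * guest address space.
+	 */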
struct mlx5dv_devx_umem *umem; + struct mlx5_devx_obj *mkey; + int is_indirect; +}; + +struct mlx5_vdpa_virtq { + SLIST_ENTRY(mlx5_vdpa_virtq) next; + uint8_t enable; + uint16_t index; + uint16_t vq_size; + struct mlx5_vdpa_priv *priv; + struct mlx5_devx_obj *virtq; + struct mlx5_vdpa_event_qp eqp; + struct { + struct mlx5dv_devx_umem *obj; + void *buf; + uint32_t size; + } umems[3]; + struct rte_intr_handle intr_handle; +}; + +struct mlx5_vdpa_steer { + struct mlx5_devx_obj *rqt; + void *domain; + void *tbl; + struct { + struct mlx5dv_flow_matcher *matcher; + struct mlx5_devx_obj *tir; + void *tir_action; + void *flow; + } rss[7]; +}; + +struct mlx5_vdpa_priv { + TAILQ_ENTRY(mlx5_vdpa_priv) next; + uint8_t configured; + uint8_t direct_notifier; /* Whether direct notifier is on or off. */ + int id; /* vDPA device id. */ + int vid; /* vhost device id. */ + struct ibv_context *ctx; /* Device context. */ + struct rte_vdpa_dev_addr dev_addr; + struct mlx5_hca_vdpa_attr caps; + uint32_t pdn; /* Protection Domain number. */ + struct ibv_pd *pd; + uint32_t gpa_mkey_index; + struct ibv_mr *null_mr; + struct rte_vhost_memory *vmem; + uint32_t eqn; + struct mlx5dv_devx_event_channel *eventc; + struct mlx5dv_devx_uar *uar; + struct rte_intr_handle intr_handle; + struct mlx5_devx_obj *td; + struct mlx5_devx_obj *tis; + uint16_t nr_virtqs; + uint64_t features; /* Negotiated features. */ + uint16_t log_max_rqt_size; + struct mlx5_vdpa_steer steer; + struct mlx5dv_var *var; + void *virtq_db_addr; + SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list; + struct mlx5_vdpa_virtq virtqs[]; +}; + +/* + * Check whether virtq is for traffic receive. + * According to VIRTIO_NET Spec the virtqueues index identity its type by: + * 0 receiveq1 + * 1 transmitq1 + * ... + * 2(N-1) receiveqN + * 2(N-1)+1 transmitqN + * 2N controlq + */ +static inline uint8_t +is_virtq_recvq(int virtq_index, int nr_vring) +{ + if (virtq_index % 2 == 0 && virtq_index != nr_vring - 1) + return 1; + return 0; +} + +/** + * Release all the prepared memory regions and all their related resources. + * + * @param[in] priv + * The vdpa driver private structure. + */ +void mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv); + +/** + * Register all the memory regions of the virtio device to the HW and allocate + * all their related resources. + * + * @param[in] priv + * The vdpa driver private structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv); + + +/** + * Create an event QP and all its related resources. + * + * @param[in] priv + * The vdpa driver private structure. + * @param[in] desc_n + * Number of descriptors. + * @param[in] callfd + * The guest notification file descriptor. + * @param[in/out] eqp + * Pointer to the event QP structure. + * + * @return + * 0 on success, -1 otherwise and rte_errno is set. + */ +int mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n, + int callfd, struct mlx5_vdpa_event_qp *eqp); + +/** + * Destroy an event QP and all its related resources. + * + * @param[in/out] eqp + * Pointer to the event QP structure. + */ +void mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp); + +/** + * Release all the event global resources. + * + * @param[in] priv + * The vdpa driver private structure. + */ +void mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv); + +/** + * Setup CQE event. + * + * @param[in] priv + * The vdpa driver private structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv); + +/** + * Unset CQE event . + * + * @param[in] priv + * The vdpa driver private structure. + */ +void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv); + +/** + * Release a virtq and all its related resources. + * + * @param[in] priv + * The vdpa driver private structure. + */ +void mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv); + +/** + * Create all the HW virtqs resources and all their related resources. + * + * @param[in] priv + * The vdpa driver private structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int mlx5_vdpa_virtqs_prepare(struct mlx5_vdpa_priv *priv); + +/** + * Enable\Disable virtq.. + * + * @param[in] priv + * The vdpa driver private structure. + * @param[in] index + * The virtq index. + * @param[in] enable + * Set to enable, otherwise disable. + * + * @return + * 0 on success, a negative value otherwise. + */ +int mlx5_vdpa_virtq_enable(struct mlx5_vdpa_priv *priv, int index, int enable); + +/** + * Unset steering and release all its related resources- stop traffic. + * + * @param[in] priv + * The vdpa driver private structure. + */ +void mlx5_vdpa_steer_unset(struct mlx5_vdpa_priv *priv); + +/** + * Update steering according to the received queues status. + * + * @param[in] priv + * The vdpa driver private structure. + * + * @return + * 0 on success, a negative value otherwise. + */ +int mlx5_vdpa_steer_update(struct mlx5_vdpa_priv *priv); + +/** + * Setup steering and all its related resources to enable RSS traffic from the + * device to all the Rx host queues. + * + * @param[in] priv + * The vdpa driver private structure. + * + * @return + * 0 on success, a negative value otherwise. + */ +int mlx5_vdpa_steer_setup(struct mlx5_vdpa_priv *priv); + +/** + * Enable\Disable live migration logging. + * + * @param[in] priv + * The vdpa driver private structure. + * @param[in] enable + * Set for enable, unset for disable. + * + * @return + * 0 on success, a negative value otherwise. + */ +int mlx5_vdpa_logging_enable(struct mlx5_vdpa_priv *priv, int enable); + +/** + * Set dirty bitmap logging to allow live migration. + * + * @param[in] priv + * The vdpa driver private structure. + * @param[in] log_base + * Vhost log base. + * @param[in] log_size + * Vhost log size. + * + * @return + * 0 on success, a negative value otherwise. + */ +int mlx5_vdpa_dirty_bitmap_set(struct mlx5_vdpa_priv *priv, uint64_t log_base, + uint64_t log_size); + +/** + * Log all virtqs information for live migration. + * + * @param[in] priv + * The vdpa driver private structure. + * @param[in] enable + * Set for enable, unset for disable. + * + * @return + * 0 on success, a negative value otherwise. + */ +int mlx5_vdpa_lm_log(struct mlx5_vdpa_priv *priv); + +/** + * Modify virtq state to be ready or suspend. + * + * @param[in] virtq + * The vdpa driver private virtq structure. + * @param[in] state + * Set for ready, otherwise suspend. + * + * @return + * 0 on success, a negative value otherwise. + */ +int mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state); + +/** + * Stop virtq before destroying it. + * + * @param[in] priv + * The vdpa driver private structure. + * @param[in] index + * The virtq index. + * + * @return + * 0 on success, a negative value otherwise. 
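+ *
+ * The virtq is moved to SUSPEND state, then its hw_available/hw_used
+ * indexes are queried and handed back to the vhost library with
+ * rte_vhost_set_vring_base().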
+ */ +int mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index); + +#endif /* RTE_PMD_MLX5_VDPA_H_ */ diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c new file mode 100644 index 000000000..dd60150fe --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c @@ -0,0 +1,401 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ +#include <unistd.h> +#include <stdint.h> +#include <fcntl.h> +#include <sys/eventfd.h> + +#include <rte_malloc.h> +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_atomic.h> +#include <rte_common.h> +#include <rte_io.h> + +#include <mlx5_common.h> + +#include "mlx5_vdpa_utils.h" +#include "mlx5_vdpa.h" + + +void +mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv) +{ + if (priv->uar) { + mlx5_glue->devx_free_uar(priv->uar); + priv->uar = NULL; + } + if (priv->eventc) { + mlx5_glue->devx_destroy_event_channel(priv->eventc); + priv->eventc = NULL; + } + priv->eqn = 0; +} + +/* Prepare all the global resources for all the event objects.*/ +static int +mlx5_vdpa_event_qp_global_prepare(struct mlx5_vdpa_priv *priv) +{ + uint32_t lcore; + + if (priv->eventc) + return 0; + lcore = (uint32_t)rte_lcore_to_cpu_id(-1); + if (mlx5_glue->devx_query_eqn(priv->ctx, lcore, &priv->eqn)) { + rte_errno = errno; + DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno); + return -1; + } + priv->eventc = mlx5_glue->devx_create_event_channel(priv->ctx, + MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA); + if (!priv->eventc) { + rte_errno = errno; + DRV_LOG(ERR, "Failed to create event channel %d.", + rte_errno); + goto error; + } + priv->uar = mlx5_glue->devx_alloc_uar(priv->ctx, 0); + if (!priv->uar) { + rte_errno = errno; + DRV_LOG(ERR, "Failed to allocate UAR."); + goto error; + } + return 0; +error: + mlx5_vdpa_event_qp_global_release(priv); + return -1; +} + +static void +mlx5_vdpa_cq_destroy(struct mlx5_vdpa_cq *cq) +{ + if (cq->cq) + claim_zero(mlx5_devx_cmd_destroy(cq->cq)); + if (cq->umem_obj) + claim_zero(mlx5_glue->devx_umem_dereg(cq->umem_obj)); + if (cq->umem_buf) + rte_free((void *)(uintptr_t)cq->umem_buf); + memset(cq, 0, sizeof(*cq)); +} + +static inline void +mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq *cq) +{ + uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET; + uint32_t cq_ci = cq->cq_ci & MLX5_CI_MASK; + uint32_t doorbell_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | cq_ci; + uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq->id; + uint64_t db_be = rte_cpu_to_be_64(doorbell); + uint32_t *addr = RTE_PTR_ADD(priv->uar->base_addr, MLX5_CQ_DOORBELL); + + rte_io_wmb(); + cq->db_rec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi); + rte_wmb(); +#ifdef RTE_ARCH_64 + *(uint64_t *)addr = db_be; +#else + *(uint32_t *)addr = db_be; + rte_io_wmb(); + *((uint32_t *)addr + 1) = db_be >> 32; +#endif + cq->arm_sn++; +} + +static int +mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n, + int callfd, struct mlx5_vdpa_cq *cq) +{ + struct mlx5_devx_cq_attr attr; + size_t pgsize = sysconf(_SC_PAGESIZE); + uint32_t umem_size; + int ret; + uint16_t event_nums[1] = {0}; + + cq->log_desc_n = log_desc_n; + umem_size = sizeof(struct mlx5_cqe) * (1 << log_desc_n) + + sizeof(*cq->db_rec) * 2; + cq->umem_buf = rte_zmalloc(__func__, umem_size, 4096); + if (!cq->umem_buf) { + DRV_LOG(ERR, "Failed to allocate memory for CQ."); + rte_errno = ENOMEM; + return -ENOMEM; + } + cq->umem_obj = 
mlx5_glue->devx_umem_reg(priv->ctx, + (void *)(uintptr_t)cq->umem_buf, + umem_size, + IBV_ACCESS_LOCAL_WRITE); + if (!cq->umem_obj) { + DRV_LOG(ERR, "Failed to register umem for CQ."); + goto error; + } + attr.q_umem_valid = 1; + attr.db_umem_valid = 1; + attr.use_first_only = 0; + attr.overrun_ignore = 0; + attr.uar_page_id = priv->uar->page_id; + attr.q_umem_id = cq->umem_obj->umem_id; + attr.q_umem_offset = 0; + attr.db_umem_id = cq->umem_obj->umem_id; + attr.db_umem_offset = sizeof(struct mlx5_cqe) * (1 << log_desc_n); + attr.eqn = priv->eqn; + attr.log_cq_size = log_desc_n; + attr.log_page_size = rte_log2_u32(pgsize); + cq->cq = mlx5_devx_cmd_create_cq(priv->ctx, &attr); + if (!cq->cq) + goto error; + cq->db_rec = RTE_PTR_ADD(cq->umem_buf, (uintptr_t)attr.db_umem_offset); + cq->cq_ci = 0; + rte_spinlock_init(&cq->sl); + /* Subscribe CQ event to the event channel controlled by the driver. */ + ret = mlx5_glue->devx_subscribe_devx_event(priv->eventc, cq->cq->obj, + sizeof(event_nums), + event_nums, + (uint64_t)(uintptr_t)cq); + if (ret) { + DRV_LOG(ERR, "Failed to subscribe CQE event."); + rte_errno = errno; + goto error; + } + cq->callfd = callfd; + /* Init CQ to ones to be in HW owner in the start. */ + memset((void *)(uintptr_t)cq->umem_buf, 0xFF, attr.db_umem_offset); + /* First arming. */ + mlx5_vdpa_cq_arm(priv, cq); + return 0; +error: + mlx5_vdpa_cq_destroy(cq); + return -1; +} + +static inline void __rte_unused +mlx5_vdpa_cq_poll(struct mlx5_vdpa_priv *priv __rte_unused, + struct mlx5_vdpa_cq *cq) +{ + struct mlx5_vdpa_event_qp *eqp = + container_of(cq, struct mlx5_vdpa_event_qp, cq); + const unsigned int cq_size = 1 << cq->log_desc_n; + const unsigned int cq_mask = cq_size - 1; + int ret; + + do { + volatile struct mlx5_cqe *cqe = cq->cqes + (cq->cq_ci & + cq_mask); + + ret = check_cqe(cqe, cq_size, cq->cq_ci); + switch (ret) { + case MLX5_CQE_STATUS_ERR: + cq->errors++; + /*fall-through*/ + case MLX5_CQE_STATUS_SW_OWN: + cq->cq_ci++; + break; + case MLX5_CQE_STATUS_HW_OWN: + default: + break; + } + } while (ret != MLX5_CQE_STATUS_HW_OWN); + rte_io_wmb(); + /* Ring CQ doorbell record. */ + cq->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci); + rte_io_wmb(); + /* Ring SW QP doorbell record. */ + eqp->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci + cq_size); +} + +static void +mlx5_vdpa_interrupt_handler(void *cb_arg) +{ +#ifndef HAVE_IBV_DEVX_EVENT + (void)cb_arg; + return; +#else + struct mlx5_vdpa_priv *priv = cb_arg; + union { + struct mlx5dv_devx_async_event_hdr event_resp; + uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128]; + } out; + + while (mlx5_glue->devx_get_event(priv->eventc, &out.event_resp, + sizeof(out.buf)) >= + (ssize_t)sizeof(out.event_resp.cookie)) { + struct mlx5_vdpa_cq *cq = (struct mlx5_vdpa_cq *) + (uintptr_t)out.event_resp.cookie; + rte_spinlock_lock(&cq->sl); + mlx5_vdpa_cq_poll(priv, cq); + mlx5_vdpa_cq_arm(priv, cq); + if (cq->callfd != -1) + /* Notify guest for descriptors consuming. */ + eventfd_write(cq->callfd, (eventfd_t)1); + rte_spinlock_unlock(&cq->sl); + DRV_LOG(DEBUG, "CQ %d event: new cq_ci = %u.", cq->cq->id, + cq->cq_ci); + } +#endif /* HAVE_IBV_DEVX_ASYNC */ +} + +int +mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv) +{ + int flags; + int ret; + + if (!priv->eventc) + /* All virtqs are in poll mode. 
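+		 * No event channel was created, so there is no CQE
+		 * interrupt to register.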
*/ + return 0; + flags = fcntl(priv->eventc->fd, F_GETFL); + ret = fcntl(priv->eventc->fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + DRV_LOG(ERR, "Failed to change event channel FD."); + rte_errno = errno; + return -rte_errno; + } + priv->intr_handle.fd = priv->eventc->fd; + priv->intr_handle.type = RTE_INTR_HANDLE_EXT; + if (rte_intr_callback_register(&priv->intr_handle, + mlx5_vdpa_interrupt_handler, priv)) { + priv->intr_handle.fd = 0; + DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno); + return -rte_errno; + } + return 0; +} + +void +mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv) +{ + int retries = MLX5_VDPA_INTR_RETRIES; + int ret = -EAGAIN; + + if (priv->intr_handle.fd) { + while (retries-- && ret == -EAGAIN) { + ret = rte_intr_callback_unregister(&priv->intr_handle, + mlx5_vdpa_interrupt_handler, + priv); + if (ret == -EAGAIN) { + DRV_LOG(DEBUG, "Try again to unregister fd %d " + "of CQ interrupt, retries = %d.", + priv->intr_handle.fd, retries); + usleep(MLX5_VDPA_INTR_RETRIES_USEC); + } + } + memset(&priv->intr_handle, 0, sizeof(priv->intr_handle)); + } +} + +void +mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp) +{ + if (eqp->sw_qp) + claim_zero(mlx5_devx_cmd_destroy(eqp->sw_qp)); + if (eqp->umem_obj) + claim_zero(mlx5_glue->devx_umem_dereg(eqp->umem_obj)); + if (eqp->umem_buf) + rte_free(eqp->umem_buf); + if (eqp->fw_qp) + claim_zero(mlx5_devx_cmd_destroy(eqp->fw_qp)); + mlx5_vdpa_cq_destroy(&eqp->cq); + memset(eqp, 0, sizeof(*eqp)); +} + +static int +mlx5_vdpa_qps2rts(struct mlx5_vdpa_event_qp *eqp) +{ + if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RST2INIT_QP, + eqp->sw_qp->id)) { + DRV_LOG(ERR, "Failed to modify FW QP to INIT state(%u).", + rte_errno); + return -1; + } + if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RST2INIT_QP, + eqp->fw_qp->id)) { + DRV_LOG(ERR, "Failed to modify SW QP to INIT state(%u).", + rte_errno); + return -1; + } + if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_INIT2RTR_QP, + eqp->sw_qp->id)) { + DRV_LOG(ERR, "Failed to modify FW QP to RTR state(%u).", + rte_errno); + return -1; + } + if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_INIT2RTR_QP, + eqp->fw_qp->id)) { + DRV_LOG(ERR, "Failed to modify SW QP to RTR state(%u).", + rte_errno); + return -1; + } + if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RTR2RTS_QP, + eqp->sw_qp->id)) { + DRV_LOG(ERR, "Failed to modify FW QP to RTS state(%u).", + rte_errno); + return -1; + } + if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RTR2RTS_QP, + eqp->fw_qp->id)) { + DRV_LOG(ERR, "Failed to modify SW QP to RTS state(%u).", + rte_errno); + return -1; + } + return 0; +} + +int +mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n, + int callfd, struct mlx5_vdpa_event_qp *eqp) +{ + struct mlx5_devx_qp_attr attr = {0}; + uint16_t log_desc_n = rte_log2_u32(desc_n); + uint32_t umem_size = (1 << log_desc_n) * MLX5_WSEG_SIZE + + sizeof(*eqp->db_rec) * 2; + + if (mlx5_vdpa_event_qp_global_prepare(priv)) + return -1; + if (mlx5_vdpa_cq_create(priv, log_desc_n, callfd, &eqp->cq)) + return -1; + attr.pd = priv->pdn; + eqp->fw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr); + if (!eqp->fw_qp) { + DRV_LOG(ERR, "Failed to create FW QP(%u).", rte_errno); + goto error; + } + eqp->umem_buf = rte_zmalloc(__func__, umem_size, 4096); + if (!eqp->umem_buf) { + DRV_LOG(ERR, "Failed to allocate memory for SW QP."); + rte_errno = ENOMEM; + goto error; + } + eqp->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx, + (void 
*)(uintptr_t)eqp->umem_buf, + umem_size, + IBV_ACCESS_LOCAL_WRITE); + if (!eqp->umem_obj) { + DRV_LOG(ERR, "Failed to register umem for SW QP."); + goto error; + } + attr.uar_index = priv->uar->page_id; + attr.cqn = eqp->cq.cq->id; + attr.log_page_size = rte_log2_u32(sysconf(_SC_PAGESIZE)); + attr.rq_size = 1 << log_desc_n; + attr.log_rq_stride = rte_log2_u32(MLX5_WSEG_SIZE); + attr.sq_size = 0; /* No need SQ. */ + attr.dbr_umem_valid = 1; + attr.wq_umem_id = eqp->umem_obj->umem_id; + attr.wq_umem_offset = 0; + attr.dbr_umem_id = eqp->umem_obj->umem_id; + attr.dbr_address = (1 << log_desc_n) * MLX5_WSEG_SIZE; + eqp->sw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr); + if (!eqp->sw_qp) { + DRV_LOG(ERR, "Failed to create SW QP(%u).", rte_errno); + goto error; + } + eqp->db_rec = RTE_PTR_ADD(eqp->umem_buf, (uintptr_t)attr.dbr_address); + if (mlx5_vdpa_qps2rts(eqp)) + goto error; + /* First ringing. */ + rte_write32(rte_cpu_to_be_32(1 << log_desc_n), &eqp->db_rec[0]); + return 0; +error: + mlx5_vdpa_event_qp_destroy(eqp); + return -1; +} diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_lm.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_lm.c new file mode 100644 index 000000000..460e01d80 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_lm.c @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ +#include <rte_malloc.h> +#include <rte_errno.h> + +#include "mlx5_vdpa_utils.h" +#include "mlx5_vdpa.h" + + +int +mlx5_vdpa_logging_enable(struct mlx5_vdpa_priv *priv, int enable) +{ + struct mlx5_devx_virtq_attr attr = { + .type = MLX5_VIRTQ_MODIFY_TYPE_DIRTY_BITMAP_DUMP_ENABLE, + .dirty_bitmap_dump_enable = enable, + }; + int i; + + for (i = 0; i < priv->nr_virtqs; ++i) { + attr.queue_index = i; + if (!priv->virtqs[i].virtq || + mlx5_devx_cmd_modify_virtq(priv->virtqs[i].virtq, &attr)) { + DRV_LOG(ERR, "Failed to modify virtq %d logging.", i); + return -1; + } + } + return 0; +} + +int +mlx5_vdpa_dirty_bitmap_set(struct mlx5_vdpa_priv *priv, uint64_t log_base, + uint64_t log_size) +{ + struct mlx5_devx_mkey_attr mkey_attr = { + .addr = (uintptr_t)log_base, + .size = log_size, + .pd = priv->pdn, + .pg_access = 1, + .klm_array = NULL, + .klm_num = 0, + .relaxed_ordering = 0, + }; + struct mlx5_devx_virtq_attr attr = { + .type = MLX5_VIRTQ_MODIFY_TYPE_DIRTY_BITMAP_PARAMS, + .dirty_bitmap_addr = log_base, + .dirty_bitmap_size = log_size, + }; + struct mlx5_vdpa_query_mr *mr = rte_malloc(__func__, sizeof(*mr), 0); + int i; + + if (!mr) { + DRV_LOG(ERR, "Failed to allocate mem for lm mr."); + return -1; + } + mr->umem = mlx5_glue->devx_umem_reg(priv->ctx, + (void *)(uintptr_t)log_base, + log_size, IBV_ACCESS_LOCAL_WRITE); + if (!mr->umem) { + DRV_LOG(ERR, "Failed to register umem for lm mr."); + goto err; + } + mkey_attr.umem_id = mr->umem->umem_id; + mr->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr); + if (!mr->mkey) { + DRV_LOG(ERR, "Failed to create Mkey for lm."); + goto err; + } + attr.dirty_bitmap_mkey = mr->mkey->id; + for (i = 0; i < priv->nr_virtqs; ++i) { + attr.queue_index = i; + if (!priv->virtqs[i].virtq || + mlx5_devx_cmd_modify_virtq(priv->virtqs[i].virtq, &attr)) { + DRV_LOG(ERR, "Failed to modify virtq %d for lm.", i); + goto err; + } + } + mr->is_indirect = 0; + SLIST_INSERT_HEAD(&priv->mr_list, mr, next); + return 0; +err: + if (mr->mkey) + mlx5_devx_cmd_destroy(mr->mkey); + if (mr->umem) + mlx5_glue->devx_umem_dereg(mr->umem); + rte_free(mr); + return -1; +} + +#define MLX5_VDPA_USED_RING_LEN(size) \ + 
((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3) + +int +mlx5_vdpa_lm_log(struct mlx5_vdpa_priv *priv) +{ + uint64_t features; + int ret = rte_vhost_get_negotiated_features(priv->vid, &features); + int i; + + if (ret) { + DRV_LOG(ERR, "Failed to get negotiated features."); + return -1; + } + if (!RTE_VHOST_NEED_LOG(features)) + return 0; + for (i = 0; i < priv->nr_virtqs; ++i) { + if (priv->virtqs[i].virtq) { + ret = mlx5_vdpa_virtq_stop(priv, i); + if (ret) { + DRV_LOG(ERR, "Failed to stop virtq %d.", i); + return -1; + } + } else { + DRV_LOG(ERR, "virtq %d is not created.", i); + return -1; + } + rte_vhost_log_used_vring(priv->vid, i, 0, + MLX5_VDPA_USED_RING_LEN(priv->virtqs[i].vq_size)); + } + return 0; +} diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c new file mode 100644 index 000000000..da31b47ec --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ +#include <stdlib.h> + +#include <rte_malloc.h> +#include <rte_errno.h> +#include <rte_common.h> +#include <rte_sched_common.h> + +#include <mlx5_prm.h> +#include <mlx5_common.h> + +#include "mlx5_vdpa_utils.h" +#include "mlx5_vdpa.h" + +static int +mlx5_vdpa_pd_prepare(struct mlx5_vdpa_priv *priv) +{ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (priv->pd) + return 0; + priv->pd = mlx5_glue->alloc_pd(priv->ctx); + if (priv->pd == NULL) { + DRV_LOG(ERR, "Failed to allocate PD."); + return errno ? -errno : -ENOMEM; + } + struct mlx5dv_obj obj; + struct mlx5dv_pd pd_info; + int ret = 0; + + obj.pd.in = priv->pd; + obj.pd.out = &pd_info; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); + if (ret) { + DRV_LOG(ERR, "Fail to get PD object info."); + mlx5_glue->dealloc_pd(priv->pd); + priv->pd = NULL; + return -errno; + } + priv->pdn = pd_info.pdn; + return 0; +#else + (void)priv; + DRV_LOG(ERR, "Cannot get pdn - no DV support."); + return -ENOTSUP; +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ +} + +void +mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv) +{ + struct mlx5_vdpa_query_mr *entry; + struct mlx5_vdpa_query_mr *next; + + entry = SLIST_FIRST(&priv->mr_list); + while (entry) { + next = SLIST_NEXT(entry, next); + claim_zero(mlx5_devx_cmd_destroy(entry->mkey)); + if (!entry->is_indirect) + claim_zero(mlx5_glue->devx_umem_dereg(entry->umem)); + SLIST_REMOVE(&priv->mr_list, entry, mlx5_vdpa_query_mr, next); + rte_free(entry); + entry = next; + } + SLIST_INIT(&priv->mr_list); + if (priv->null_mr) { + claim_zero(mlx5_glue->dereg_mr(priv->null_mr)); + priv->null_mr = NULL; + } + if (priv->pd) { + claim_zero(mlx5_glue->dealloc_pd(priv->pd)); + priv->pd = NULL; + } + if (priv->vmem) { + free(priv->vmem); + priv->vmem = NULL; + } +} + +static int +mlx5_vdpa_regions_addr_cmp(const void *a, const void *b) +{ + const struct rte_vhost_mem_region *region_a = a; + const struct rte_vhost_mem_region *region_b = b; + + if (region_a->guest_phys_addr < region_b->guest_phys_addr) + return -1; + if (region_a->guest_phys_addr > region_b->guest_phys_addr) + return 1; + return 0; +} + +#define KLM_NUM_MAX_ALIGN(sz) (RTE_ALIGN_CEIL(sz, MLX5_MAX_KLM_BYTE_COUNT) / \ + MLX5_MAX_KLM_BYTE_COUNT) + +/* + * Allocate and sort the region list and choose indirect mkey mode: + * 1. Calculate GCD, guest memory size and indirect mkey entries num per mode. + * 2. Align GCD to the maximum allowed size(2G) and to be power of 2. + * 2. 
Decide the indirect mkey mode according to the next rules: + * a. If both KLM_FBS entries number and KLM entries number are bigger + * than the maximum allowed(MLX5_DEVX_MAX_KLM_ENTRIES) - error. + * b. KLM mode if KLM_FBS entries number is bigger than the maximum + * allowed(MLX5_DEVX_MAX_KLM_ENTRIES). + * c. KLM mode if GCD is smaller than the minimum allowed(4K). + * d. KLM mode if the total size of KLM entries is in one cache line + * and the total size of KLM_FBS entries is not in one cache line. + * e. Otherwise, KLM_FBS mode. + */ +static struct rte_vhost_memory * +mlx5_vdpa_vhost_mem_regions_prepare(int vid, uint8_t *mode, uint64_t *mem_size, + uint64_t *gcd, uint32_t *entries_num) +{ + struct rte_vhost_memory *mem; + uint64_t size; + uint64_t klm_entries_num = 0; + uint64_t klm_fbs_entries_num; + uint32_t i; + int ret = rte_vhost_get_mem_table(vid, &mem); + + if (ret < 0) { + DRV_LOG(ERR, "Failed to get VM memory layout vid =%d.", vid); + rte_errno = EINVAL; + return NULL; + } + qsort(mem->regions, mem->nregions, sizeof(mem->regions[0]), + mlx5_vdpa_regions_addr_cmp); + *mem_size = (mem->regions[(mem->nregions - 1)].guest_phys_addr) + + (mem->regions[(mem->nregions - 1)].size) - + (mem->regions[0].guest_phys_addr); + *gcd = 0; + for (i = 0; i < mem->nregions; ++i) { + DRV_LOG(INFO, "Region %u: HVA 0x%" PRIx64 ", GPA 0x%" PRIx64 + ", size 0x%" PRIx64 ".", i, + mem->regions[i].host_user_addr, + mem->regions[i].guest_phys_addr, mem->regions[i].size); + if (i > 0) { + /* Hole handle. */ + size = mem->regions[i].guest_phys_addr - + (mem->regions[i - 1].guest_phys_addr + + mem->regions[i - 1].size); + *gcd = rte_get_gcd(*gcd, size); + klm_entries_num += KLM_NUM_MAX_ALIGN(size); + } + size = mem->regions[i].size; + *gcd = rte_get_gcd(*gcd, size); + klm_entries_num += KLM_NUM_MAX_ALIGN(size); + } + if (*gcd > MLX5_MAX_KLM_BYTE_COUNT) + *gcd = rte_get_gcd(*gcd, MLX5_MAX_KLM_BYTE_COUNT); + if (!RTE_IS_POWER_OF_2(*gcd)) { + uint64_t candidate_gcd = rte_align64prevpow2(*gcd); + + while (candidate_gcd > 1 && (*gcd % candidate_gcd)) + candidate_gcd /= 2; + DRV_LOG(DEBUG, "GCD 0x%" PRIx64 " is not power of 2. Adjusted " + "GCD is 0x%" PRIx64 ".", *gcd, candidate_gcd); + *gcd = candidate_gcd; + } + klm_fbs_entries_num = *mem_size / *gcd; + if (*gcd < MLX5_MIN_KLM_FIXED_BUFFER_SIZE || klm_fbs_entries_num > + MLX5_DEVX_MAX_KLM_ENTRIES || + ((klm_entries_num * sizeof(struct mlx5_klm)) <= + RTE_CACHE_LINE_SIZE && (klm_fbs_entries_num * + sizeof(struct mlx5_klm)) > + RTE_CACHE_LINE_SIZE)) { + *mode = MLX5_MKC_ACCESS_MODE_KLM; + *entries_num = klm_entries_num; + DRV_LOG(INFO, "Indirect mkey mode is KLM."); + } else { + *mode = MLX5_MKC_ACCESS_MODE_KLM_FBS; + *entries_num = klm_fbs_entries_num; + DRV_LOG(INFO, "Indirect mkey mode is KLM Fixed Buffer Size."); + } + DRV_LOG(DEBUG, "Memory registration information: nregions = %u, " + "mem_size = 0x%" PRIx64 ", GCD = 0x%" PRIx64 + ", klm_fbs_entries_num = 0x%" PRIx64 ", klm_entries_num = 0x%" + PRIx64 ".", mem->nregions, *mem_size, *gcd, klm_fbs_entries_num, + klm_entries_num); + if (*entries_num > MLX5_DEVX_MAX_KLM_ENTRIES) { + DRV_LOG(ERR, "Failed to prepare memory of vid %d - memory is " + "too fragmented.", vid); + free(mem); + return NULL; + } + return mem; +} + +#define KLM_SIZE_MAX_ALIGN(sz) ((sz) > MLX5_MAX_KLM_BYTE_COUNT ? \ + MLX5_MAX_KLM_BYTE_COUNT : (sz)) + +/* + * The target here is to group all the physical memory regions of the + * virtio device in one indirect mkey. 
+ * For KLM Fixed Buffer Size mode (HW find the translation entry in one + * read according to the guest phisical address): + * All the sub-direct mkeys of it must be in the same size, hence, each + * one of them should be in the GCD size of all the virtio memory + * regions and the holes between them. + * For KLM mode (each entry may be in different size so HW must iterate + * the entries): + * Each virtio memory region and each hole between them have one entry, + * just need to cover the maximum allowed size(2G) by splitting entries + * which their associated memory regions are bigger than 2G. + * It means that each virtio memory region may be mapped to more than + * one direct mkey in the 2 modes. + * All the holes of invalid memory between the virtio memory regions + * will be mapped to the null memory region for security. + */ +int +mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv) +{ + struct mlx5_devx_mkey_attr mkey_attr; + struct mlx5_vdpa_query_mr *entry = NULL; + struct rte_vhost_mem_region *reg = NULL; + uint8_t mode; + uint32_t entries_num = 0; + uint32_t i; + uint64_t gcd; + uint64_t klm_size; + uint64_t mem_size; + uint64_t k; + int klm_index = 0; + int ret; + struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare + (priv->vid, &mode, &mem_size, &gcd, &entries_num); + struct mlx5_klm klm_array[entries_num]; + + if (!mem) + return -rte_errno; + priv->vmem = mem; + ret = mlx5_vdpa_pd_prepare(priv); + if (ret) + goto error; + priv->null_mr = mlx5_glue->alloc_null_mr(priv->pd); + if (!priv->null_mr) { + DRV_LOG(ERR, "Failed to allocate null MR."); + ret = -errno; + goto error; + } + DRV_LOG(DEBUG, "Dump fill Mkey = %u.", priv->null_mr->lkey); + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + entry = rte_zmalloc(__func__, sizeof(*entry), 0); + if (!entry) { + ret = -ENOMEM; + DRV_LOG(ERR, "Failed to allocate mem entry memory."); + goto error; + } + entry->umem = mlx5_glue->devx_umem_reg(priv->ctx, + (void *)(uintptr_t)reg->host_user_addr, + reg->size, IBV_ACCESS_LOCAL_WRITE); + if (!entry->umem) { + DRV_LOG(ERR, "Failed to register Umem by Devx."); + ret = -errno; + goto error; + } + mkey_attr.addr = (uintptr_t)(reg->guest_phys_addr); + mkey_attr.size = reg->size; + mkey_attr.umem_id = entry->umem->umem_id; + mkey_attr.pd = priv->pdn; + mkey_attr.pg_access = 1; + mkey_attr.klm_array = NULL; + mkey_attr.klm_num = 0; + mkey_attr.relaxed_ordering = 0; + entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr); + if (!entry->mkey) { + DRV_LOG(ERR, "Failed to create direct Mkey."); + ret = -rte_errno; + goto error; + } + entry->addr = (void *)(uintptr_t)(reg->host_user_addr); + entry->length = reg->size; + entry->is_indirect = 0; + if (i > 0) { + uint64_t sadd; + uint64_t empty_region_sz = reg->guest_phys_addr - + (mem->regions[i - 1].guest_phys_addr + + mem->regions[i - 1].size); + + if (empty_region_sz > 0) { + sadd = mem->regions[i - 1].guest_phys_addr + + mem->regions[i - 1].size; + klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ? + KLM_SIZE_MAX_ALIGN(empty_region_sz) : gcd; + for (k = 0; k < empty_region_sz; + k += klm_size) { + klm_array[klm_index].byte_count = + k + klm_size > empty_region_sz ? + empty_region_sz - k : klm_size; + klm_array[klm_index].mkey = + priv->null_mr->lkey; + klm_array[klm_index].address = sadd + k; + klm_index++; + } + } + } + klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ? + KLM_SIZE_MAX_ALIGN(reg->size) : gcd; + for (k = 0; k < reg->size; k += klm_size) { + klm_array[klm_index].byte_count = k + klm_size > + reg->size ? 
reg->size - k : klm_size; + klm_array[klm_index].mkey = entry->mkey->id; + klm_array[klm_index].address = reg->guest_phys_addr + k; + klm_index++; + } + SLIST_INSERT_HEAD(&priv->mr_list, entry, next); + } + mkey_attr.addr = (uintptr_t)(mem->regions[0].guest_phys_addr); + mkey_attr.size = mem_size; + mkey_attr.pd = priv->pdn; + mkey_attr.umem_id = 0; + /* Must be zero for KLM mode. */ + mkey_attr.log_entity_size = mode == MLX5_MKC_ACCESS_MODE_KLM_FBS ? + rte_log2_u64(gcd) : 0; + mkey_attr.pg_access = 0; + mkey_attr.klm_array = klm_array; + mkey_attr.klm_num = klm_index; + entry = rte_zmalloc(__func__, sizeof(*entry), 0); + if (!entry) { + DRV_LOG(ERR, "Failed to allocate memory for indirect entry."); + ret = -ENOMEM; + goto error; + } + entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr); + if (!entry->mkey) { + DRV_LOG(ERR, "Failed to create indirect Mkey."); + ret = -rte_errno; + goto error; + } + entry->is_indirect = 1; + SLIST_INSERT_HEAD(&priv->mr_list, entry, next); + priv->gpa_mkey_index = entry->mkey->id; + return 0; +error: + if (entry) { + if (entry->mkey) + mlx5_devx_cmd_destroy(entry->mkey); + if (entry->umem) + mlx5_glue->devx_umem_dereg(entry->umem); + rte_free(entry); + } + mlx5_vdpa_mem_dereg(priv); + rte_errno = -ret; + return ret; +} diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_steer.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_steer.c new file mode 100644 index 000000000..406c7be17 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_steer.c @@ -0,0 +1,288 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ +#include <netinet/in.h> + +#include <rte_malloc.h> +#include <rte_errno.h> +#include <rte_common.h> + +#include <mlx5_common.h> + +#include "mlx5_vdpa_utils.h" +#include "mlx5_vdpa.h" + +static void +mlx5_vdpa_rss_flows_destroy(struct mlx5_vdpa_priv *priv) +{ + unsigned i; + + for (i = 0; i < RTE_DIM(priv->steer.rss); ++i) { + if (priv->steer.rss[i].flow) { + claim_zero(mlx5_glue->dv_destroy_flow + (priv->steer.rss[i].flow)); + priv->steer.rss[i].flow = NULL; + } + if (priv->steer.rss[i].tir_action) { + claim_zero(mlx5_glue->destroy_flow_action + (priv->steer.rss[i].tir_action)); + priv->steer.rss[i].tir_action = NULL; + } + if (priv->steer.rss[i].tir) { + claim_zero(mlx5_devx_cmd_destroy + (priv->steer.rss[i].tir)); + priv->steer.rss[i].tir = NULL; + } + if (priv->steer.rss[i].matcher) { + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (priv->steer.rss[i].matcher)); + priv->steer.rss[i].matcher = NULL; + } + } +} + +void +mlx5_vdpa_steer_unset(struct mlx5_vdpa_priv *priv) +{ + mlx5_vdpa_rss_flows_destroy(priv); + if (priv->steer.tbl) { + claim_zero(mlx5_glue->dr_destroy_flow_tbl(priv->steer.tbl)); + priv->steer.tbl = NULL; + } + if (priv->steer.domain) { + claim_zero(mlx5_glue->dr_destroy_domain(priv->steer.domain)); + priv->steer.domain = NULL; + } + if (priv->steer.rqt) { + claim_zero(mlx5_devx_cmd_destroy(priv->steer.rqt)); + priv->steer.rqt = NULL; + } +} + +#define MLX5_VDPA_DEFAULT_RQT_SIZE 512 +/* + * Return the number of queues configured to the table on success, otherwise + * -1 on error. 
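+ * A return value of 0 means that no enabled receive queue was found; in that
+ * case no RQT is created or modified.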
+ */ +static int +mlx5_vdpa_rqt_prepare(struct mlx5_vdpa_priv *priv) +{ + int i; + uint32_t rqt_n = RTE_MIN(MLX5_VDPA_DEFAULT_RQT_SIZE, + 1 << priv->log_max_rqt_size); + struct mlx5_devx_rqt_attr *attr = rte_zmalloc(__func__, sizeof(*attr) + + rqt_n * + sizeof(uint32_t), 0); + uint32_t k = 0, j; + int ret = 0, num; + + if (!attr) { + DRV_LOG(ERR, "Failed to allocate RQT attributes memory."); + rte_errno = ENOMEM; + return -ENOMEM; + } + for (i = 0; i < priv->nr_virtqs; i++) { + if (is_virtq_recvq(i, priv->nr_virtqs) && + priv->virtqs[i].enable && priv->virtqs[i].virtq) { + attr->rq_list[k] = priv->virtqs[i].virtq->id; + k++; + } + } + if (k == 0) + /* No enabled RQ to configure for RSS. */ + return 0; + num = (int)k; + for (j = 0; k != rqt_n; ++k, ++j) + attr->rq_list[k] = attr->rq_list[j]; + attr->rq_type = MLX5_INLINE_Q_TYPE_VIRTQ; + attr->rqt_max_size = rqt_n; + attr->rqt_actual_size = rqt_n; + if (!priv->steer.rqt) { + priv->steer.rqt = mlx5_devx_cmd_create_rqt(priv->ctx, attr); + if (!priv->steer.rqt) { + DRV_LOG(ERR, "Failed to create RQT."); + ret = -rte_errno; + } + } else { + ret = mlx5_devx_cmd_modify_rqt(priv->steer.rqt, attr); + if (ret) + DRV_LOG(ERR, "Failed to modify RQT."); + } + rte_free(attr); + return ret ? -1 : num; +} + +static int __rte_unused +mlx5_vdpa_rss_flows_create(struct mlx5_vdpa_priv *priv) +{ +#ifdef HAVE_MLX5DV_DR + struct mlx5_devx_tir_attr tir_att = { + .disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT, + .rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ, + .transport_domain = priv->td->id, + .indirect_table = priv->steer.rqt->id, + .rx_hash_symmetric = 1, + .rx_hash_toeplitz_key = { 0x2c, 0xc6, 0x81, 0xd1, + 0x5b, 0xdb, 0xf4, 0xf7, + 0xfc, 0xa2, 0x83, 0x19, + 0xdb, 0x1a, 0x3e, 0x94, + 0x6b, 0x9e, 0x38, 0xd9, + 0x2c, 0x9c, 0x03, 0xd1, + 0xad, 0x99, 0x44, 0xa7, + 0xd9, 0x56, 0x3d, 0x59, + 0x06, 0x3c, 0x25, 0xf3, + 0xfc, 0x1f, 0xdc, 0x2a }, + }; + struct { + size_t size; + /**< Size of match value. Do NOT split size and key! */ + uint32_t buf[MLX5_ST_SZ_DW(fte_match_param)]; + /**< Matcher value. This value is used as the mask or a key. 
*/ + } matcher_mask = { + .size = sizeof(matcher_mask.buf), + }, + matcher_value = { + .size = sizeof(matcher_value.buf), + }; + struct mlx5dv_flow_matcher_attr dv_attr = { + .type = IBV_FLOW_ATTR_NORMAL, + .match_mask = (void *)&matcher_mask, + }; + void *match_m = matcher_mask.buf; + void *match_v = matcher_value.buf; + void *headers_m = MLX5_ADDR_OF(fte_match_param, match_m, outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, outer_headers); + void *actions[1]; + const uint8_t l3_hash = + (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP) | + (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP); + const uint8_t l4_hash = + (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT) | + (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT); + enum { PRIO, CRITERIA, IP_VER_M, IP_VER_V, IP_PROT_M, IP_PROT_V, L3_BIT, + L4_BIT, HASH, END}; + const uint8_t vars[RTE_DIM(priv->steer.rss)][END] = { + { 7, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 6, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 4, 0, 0, + MLX5_L3_PROT_TYPE_IPV4, 0, l3_hash }, + { 6, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 6, 0, 0, + MLX5_L3_PROT_TYPE_IPV6, 0, l3_hash }, + { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 4, 0xff, + IPPROTO_UDP, MLX5_L3_PROT_TYPE_IPV4, MLX5_L4_PROT_TYPE_UDP, + l3_hash | l4_hash }, + { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 4, 0xff, + IPPROTO_TCP, MLX5_L3_PROT_TYPE_IPV4, MLX5_L4_PROT_TYPE_TCP, + l3_hash | l4_hash }, + { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 6, 0xff, + IPPROTO_UDP, MLX5_L3_PROT_TYPE_IPV6, MLX5_L4_PROT_TYPE_UDP, + l3_hash | l4_hash }, + { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 6, 0xff, + IPPROTO_TCP, MLX5_L3_PROT_TYPE_IPV6, MLX5_L4_PROT_TYPE_TCP, + l3_hash | l4_hash }, + }; + unsigned i; + + for (i = 0; i < RTE_DIM(priv->steer.rss); ++i) { + dv_attr.priority = vars[i][PRIO]; + dv_attr.match_criteria_enable = vars[i][CRITERIA]; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, + vars[i][IP_VER_M]); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, + vars[i][IP_VER_V]); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, + vars[i][IP_PROT_M]); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + vars[i][IP_PROT_V]); + tir_att.rx_hash_field_selector_outer.l3_prot_type = + vars[i][L3_BIT]; + tir_att.rx_hash_field_selector_outer.l4_prot_type = + vars[i][L4_BIT]; + tir_att.rx_hash_field_selector_outer.selected_fields = + vars[i][HASH]; + priv->steer.rss[i].matcher = mlx5_glue->dv_create_flow_matcher + (priv->ctx, &dv_attr, priv->steer.tbl); + if (!priv->steer.rss[i].matcher) { + DRV_LOG(ERR, "Failed to create matcher %d.", i); + goto error; + } + priv->steer.rss[i].tir = mlx5_devx_cmd_create_tir(priv->ctx, + &tir_att); + if (!priv->steer.rss[i].tir) { + DRV_LOG(ERR, "Failed to create TIR %d.", i); + goto error; + } + priv->steer.rss[i].tir_action = + mlx5_glue->dv_create_flow_action_dest_devx_tir + (priv->steer.rss[i].tir->obj); + if (!priv->steer.rss[i].tir_action) { + DRV_LOG(ERR, "Failed to create TIR action %d.", i); + goto error; + } + actions[0] = priv->steer.rss[i].tir_action; + priv->steer.rss[i].flow = mlx5_glue->dv_create_flow + (priv->steer.rss[i].matcher, + (void *)&matcher_value, 1, actions); + if (!priv->steer.rss[i].flow) { + DRV_LOG(ERR, "Failed to create flow %d.", i); + goto error; + } + } + return 0; +error: + /* Resources will be freed by the caller. 
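+	 * Partially created matcher/TIR/flow objects are eventually released
+	 * by mlx5_vdpa_rss_flows_destroy(), typically reached through
+	 * mlx5_vdpa_steer_unset().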
*/ + return -1; +#else + (void)priv; + return -ENOTSUP; +#endif /* HAVE_MLX5DV_DR */ +} + +int +mlx5_vdpa_steer_update(struct mlx5_vdpa_priv *priv) +{ + int ret = mlx5_vdpa_rqt_prepare(priv); + + if (ret == 0) { + mlx5_vdpa_rss_flows_destroy(priv); + if (priv->steer.rqt) { + claim_zero(mlx5_devx_cmd_destroy(priv->steer.rqt)); + priv->steer.rqt = NULL; + } + } else if (ret < 0) { + return ret; + } else if (!priv->steer.rss[0].flow) { + ret = mlx5_vdpa_rss_flows_create(priv); + if (ret) { + DRV_LOG(ERR, "Cannot create RSS flows."); + return -1; + } + } + return 0; +} + +int +mlx5_vdpa_steer_setup(struct mlx5_vdpa_priv *priv) +{ +#ifdef HAVE_MLX5DV_DR + priv->steer.domain = mlx5_glue->dr_create_domain(priv->ctx, + MLX5DV_DR_DOMAIN_TYPE_NIC_RX); + if (!priv->steer.domain) { + DRV_LOG(ERR, "Failed to create Rx domain."); + goto error; + } + priv->steer.tbl = mlx5_glue->dr_create_flow_tbl(priv->steer.domain, 0); + if (!priv->steer.tbl) { + DRV_LOG(ERR, "Failed to create table 0 with Rx domain."); + goto error; + } + if (mlx5_vdpa_steer_update(priv)) + goto error; + return 0; +error: + mlx5_vdpa_steer_unset(priv); + return -1; +#else + (void)priv; + return -ENOTSUP; +#endif /* HAVE_MLX5DV_DR */ +} diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_utils.h b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_utils.h new file mode 100644 index 000000000..a239df9a5 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_utils.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_VDPA_UTILS_H_ +#define RTE_PMD_MLX5_VDPA_UTILS_H_ + +#include <mlx5_common.h> + + +extern int mlx5_vdpa_logtype; + +#define MLX5_VDPA_LOG_PREFIX "mlx5_vdpa" +/* Generic printf()-like logging macro with automatic line feed. */ +#define DRV_LOG(level, ...) 
\ + PMD_DRV_LOG_(level, mlx5_vdpa_logtype, MLX5_VDPA_LOG_PREFIX, \ + __VA_ARGS__ PMD_DRV_LOG_STRIP PMD_DRV_LOG_OPAREN, \ + PMD_DRV_LOG_CPAREN) + +#endif /* RTE_PMD_MLX5_VDPA_UTILS_H_ */ diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c new file mode 100644 index 000000000..bd48460b5 --- /dev/null +++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c @@ -0,0 +1,457 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <rte_malloc.h> +#include <rte_errno.h> +#include <rte_io.h> + +#include <mlx5_common.h> + +#include "mlx5_vdpa_utils.h" +#include "mlx5_vdpa.h" + + +static void +mlx5_vdpa_virtq_handler(void *cb_arg) +{ + struct mlx5_vdpa_virtq *virtq = cb_arg; + struct mlx5_vdpa_priv *priv = virtq->priv; + uint64_t buf; + int nbytes; + + do { + nbytes = read(virtq->intr_handle.fd, &buf, 8); + if (nbytes < 0) { + if (errno == EINTR || + errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + DRV_LOG(ERR, "Failed to read kickfd of virtq %d: %s", + virtq->index, strerror(errno)); + } + break; + } while (1); + rte_write32(virtq->index, priv->virtq_db_addr); + DRV_LOG(DEBUG, "Ring virtq %u doorbell.", virtq->index); +} + +static int +mlx5_vdpa_virtq_unset(struct mlx5_vdpa_virtq *virtq) +{ + unsigned int i; + int retries = MLX5_VDPA_INTR_RETRIES; + int ret = -EAGAIN; + + if (virtq->intr_handle.fd != -1) { + while (retries-- && ret == -EAGAIN) { + ret = rte_intr_callback_unregister(&virtq->intr_handle, + mlx5_vdpa_virtq_handler, + virtq); + if (ret == -EAGAIN) { + DRV_LOG(DEBUG, "Try again to unregister fd %d " + "of virtq %d interrupt, retries = %d.", + virtq->intr_handle.fd, + (int)virtq->index, retries); + usleep(MLX5_VDPA_INTR_RETRIES_USEC); + } + } + virtq->intr_handle.fd = -1; + } + if (virtq->virtq) + claim_zero(mlx5_devx_cmd_destroy(virtq->virtq)); + virtq->virtq = NULL; + for (i = 0; i < RTE_DIM(virtq->umems); ++i) { + if (virtq->umems[i].obj) + claim_zero(mlx5_glue->devx_umem_dereg + (virtq->umems[i].obj)); + if (virtq->umems[i].buf) + rte_free(virtq->umems[i].buf); + } + memset(&virtq->umems, 0, sizeof(virtq->umems)); + if (virtq->eqp.fw_qp) + mlx5_vdpa_event_qp_destroy(&virtq->eqp); + return 0; +} + +void +mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv) +{ + int i; + + for (i = 0; i < priv->nr_virtqs; i++) { + mlx5_vdpa_virtq_unset(&priv->virtqs[i]); + priv->virtqs[i].enable = 0; + } + if (priv->tis) { + claim_zero(mlx5_devx_cmd_destroy(priv->tis)); + priv->tis = NULL; + } + if (priv->td) { + claim_zero(mlx5_devx_cmd_destroy(priv->td)); + priv->td = NULL; + } + if (priv->virtq_db_addr) { + claim_zero(munmap(priv->virtq_db_addr, priv->var->length)); + priv->virtq_db_addr = NULL; + } + priv->features = 0; + priv->nr_virtqs = 0; +} + +int +mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state) +{ + struct mlx5_devx_virtq_attr attr = { + .type = MLX5_VIRTQ_MODIFY_TYPE_STATE, + .state = state ? 
MLX5_VIRTQ_STATE_RDY : + MLX5_VIRTQ_STATE_SUSPEND, + .queue_index = virtq->index, + }; + + return mlx5_devx_cmd_modify_virtq(virtq->virtq, &attr); +} + +int +mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index) +{ + struct mlx5_devx_virtq_attr attr = {0}; + struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index]; + int ret = mlx5_vdpa_virtq_modify(virtq, 0); + + if (ret) + return -1; + if (mlx5_devx_cmd_query_virtq(virtq->virtq, &attr)) { + DRV_LOG(ERR, "Failed to query virtq %d.", index); + return -1; + } + DRV_LOG(INFO, "Query vid %d vring %d: hw_available_idx=%d, " + "hw_used_index=%d", priv->vid, index, + attr.hw_available_index, attr.hw_used_index); + ret = rte_vhost_set_vring_base(priv->vid, index, + attr.hw_available_index, + attr.hw_used_index); + if (ret) { + DRV_LOG(ERR, "Failed to set virtq %d base.", index); + return -1; + } + return 0; +} + +static uint64_t +mlx5_vdpa_hva_to_gpa(struct rte_vhost_memory *mem, uint64_t hva) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + uint64_t gpa = 0; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + if (hva >= reg->host_user_addr && + hva < reg->host_user_addr + reg->size) { + gpa = hva - reg->host_user_addr + reg->guest_phys_addr; + break; + } + } + return gpa; +} + +static int +mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index) +{ + struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index]; + struct rte_vhost_vring vq; + struct mlx5_devx_virtq_attr attr = {0}; + uint64_t gpa; + int ret; + unsigned int i; + uint16_t last_avail_idx; + uint16_t last_used_idx; + + ret = rte_vhost_get_vhost_vring(priv->vid, index, &vq); + if (ret) + return -1; + virtq->index = index; + virtq->vq_size = vq.size; + attr.tso_ipv4 = !!(priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO4)); + attr.tso_ipv6 = !!(priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO6)); + attr.tx_csum = !!(priv->features & (1ULL << VIRTIO_NET_F_CSUM)); + attr.rx_csum = !!(priv->features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)); + attr.virtio_version_1_0 = !!(priv->features & (1ULL << + VIRTIO_F_VERSION_1)); + attr.type = (priv->features & (1ULL << VIRTIO_F_RING_PACKED)) ? + MLX5_VIRTQ_TYPE_PACKED : MLX5_VIRTQ_TYPE_SPLIT; + /* + * No need event QPs creation when the guest in poll mode or when the + * capability allows it. + */ + attr.event_mode = vq.callfd != -1 || !(priv->caps.event_mode & (1 << + MLX5_VIRTQ_EVENT_MODE_NO_MSIX)) ? + MLX5_VIRTQ_EVENT_MODE_QP : + MLX5_VIRTQ_EVENT_MODE_NO_MSIX; + if (attr.event_mode == MLX5_VIRTQ_EVENT_MODE_QP) { + ret = mlx5_vdpa_event_qp_create(priv, vq.size, vq.callfd, + &virtq->eqp); + if (ret) { + DRV_LOG(ERR, "Failed to create event QPs for virtq %d.", + index); + return -1; + } + attr.qp_id = virtq->eqp.fw_qp->id; + } else { + DRV_LOG(INFO, "Virtq %d is, for sure, working by poll mode, no" + " need event QPs and event mechanism.", index); + } + /* Setup 3 UMEMs for each virtq. 
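+	 * Each umem size is taken from the device capabilities as
+	 * caps.umems[i].a * vq_size + caps.umems[i].b; the buffers are
+	 * registered to the device and referenced by the virtq object.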
+	for (i = 0; i < RTE_DIM(virtq->umems); ++i) {
+		virtq->umems[i].size = priv->caps.umems[i].a * vq.size +
+							priv->caps.umems[i].b;
+		virtq->umems[i].buf = rte_zmalloc(__func__,
+						  virtq->umems[i].size, 4096);
+		if (!virtq->umems[i].buf) {
+			DRV_LOG(ERR, "Cannot allocate umem %d memory for virtq"
+				" %u.", i, index);
+			goto error;
+		}
+		virtq->umems[i].obj = mlx5_glue->devx_umem_reg(priv->ctx,
+						virtq->umems[i].buf,
+						virtq->umems[i].size,
+						IBV_ACCESS_LOCAL_WRITE);
+		if (!virtq->umems[i].obj) {
+			DRV_LOG(ERR, "Failed to register umem %d for virtq %u.",
+				i, index);
+			goto error;
+		}
+		attr.umems[i].id = virtq->umems[i].obj->umem_id;
+		attr.umems[i].offset = 0;
+		attr.umems[i].size = virtq->umems[i].size;
+	}
+	if (attr.type == MLX5_VIRTQ_TYPE_SPLIT) {
+		gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
+					   (uint64_t)(uintptr_t)vq.desc);
+		if (!gpa) {
+			DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
+			goto error;
+		}
+		attr.desc_addr = gpa;
+		gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
+					   (uint64_t)(uintptr_t)vq.used);
+		if (!gpa) {
+			DRV_LOG(ERR, "Failed to get GPA for used ring.");
+			goto error;
+		}
+		attr.used_addr = gpa;
+		gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
+					   (uint64_t)(uintptr_t)vq.avail);
+		if (!gpa) {
+			DRV_LOG(ERR, "Failed to get GPA for available ring.");
+			goto error;
+		}
+		attr.available_addr = gpa;
+	}
+	ret = rte_vhost_get_vring_base(priv->vid, index, &last_avail_idx,
+				       &last_used_idx);
+	if (ret) {
+		last_avail_idx = 0;
+		last_used_idx = 0;
+		DRV_LOG(WARNING, "Couldn't get vring base, indexes are set to 0.");
+	} else {
+		DRV_LOG(INFO, "vid %d: Init last_avail_idx=%d, last_used_idx=%d for "
+			"virtq %d.", priv->vid, last_avail_idx,
+			last_used_idx, index);
+	}
+	attr.hw_available_index = last_avail_idx;
+	attr.hw_used_index = last_used_idx;
+	attr.q_size = vq.size;
+	attr.mkey = priv->gpa_mkey_index;
+	attr.tis_id = priv->tis->id;
+	attr.queue_index = index;
+	virtq->virtq = mlx5_devx_cmd_create_virtq(priv->ctx, &attr);
+	virtq->priv = priv;
+	if (!virtq->virtq)
+		goto error;
+	if (mlx5_vdpa_virtq_modify(virtq, 1))
+		goto error;
+	rte_write32(virtq->index, priv->virtq_db_addr);
+	/* Setup doorbell mapping. */
+	virtq->intr_handle.fd = vq.kickfd;
+	if (virtq->intr_handle.fd == -1) {
+		DRV_LOG(WARNING, "Virtq %d kickfd is invalid.", index);
+		if (!priv->direct_notifier) {
+			DRV_LOG(ERR, "Virtq %d cannot be notified.", index);
+			goto error;
+		}
+	} else {
+		virtq->intr_handle.type = RTE_INTR_HANDLE_EXT;
+		if (rte_intr_callback_register(&virtq->intr_handle,
+					       mlx5_vdpa_virtq_handler,
+					       virtq)) {
+			virtq->intr_handle.fd = -1;
+			DRV_LOG(ERR, "Failed to register virtq %d interrupt.",
+				index);
+			goto error;
+		} else {
+			DRV_LOG(DEBUG, "Register fd %d interrupt for virtq %d.",
+				virtq->intr_handle.fd, index);
+		}
+	}
+	return 0;
+error:
+	mlx5_vdpa_virtq_unset(virtq);
+	return -1;
+}
+
+static int
+mlx5_vdpa_features_validate(struct mlx5_vdpa_priv *priv)
+{
+	if (priv->features & (1ULL << VIRTIO_F_RING_PACKED)) {
+		if (!(priv->caps.virtio_queue_type & (1 <<
+		    MLX5_VIRTQ_TYPE_PACKED))) {
+			DRV_LOG(ERR, "Failed to configure PACKED mode for vdev "
+				"%d - it was not reported by HW/driver"
+				" capability.", priv->vid);
+			return -ENOTSUP;
+		}
+	}
+	if (priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO4)) {
+		if (!priv->caps.tso_ipv4) {
+			DRV_LOG(ERR, "Failed to enable TSO4 for vdev %d - TSO4"
+				" was not reported by HW/driver capability.",
+				priv->vid);
+			return -ENOTSUP;
+		}
+	}
+	if (priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO6)) {
+		if (!priv->caps.tso_ipv6) {
+			DRV_LOG(ERR, "Failed to enable TSO6 for vdev %d - TSO6"
+				" was not reported by HW/driver capability.",
+				priv->vid);
+			return -ENOTSUP;
+		}
+	}
+	if (priv->features & (1ULL << VIRTIO_NET_F_CSUM)) {
+		if (!priv->caps.tx_csum) {
+			DRV_LOG(ERR, "Failed to enable CSUM for vdev %d - CSUM"
+				" was not reported by HW/driver capability.",
+				priv->vid);
+			return -ENOTSUP;
+		}
+	}
+	if (priv->features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) {
+		if (!priv->caps.rx_csum) {
+			DRV_LOG(ERR, "Failed to enable GUEST CSUM for vdev %d -"
+				" GUEST CSUM was not reported by HW/driver "
+				"capability.", priv->vid);
+			return -ENOTSUP;
+		}
+	}
+	if (priv->features & (1ULL << VIRTIO_F_VERSION_1)) {
+		if (!priv->caps.virtio_version_1_0) {
+			DRV_LOG(ERR, "Failed to enable version 1 for vdev %d - "
+				"version 1 was not reported by HW/driver"
+				" capability.", priv->vid);
+			return -ENOTSUP;
+		}
+	}
+	return 0;
+}
+
+int
+mlx5_vdpa_virtqs_prepare(struct mlx5_vdpa_priv *priv)
+{
+	struct mlx5_devx_tis_attr tis_attr = {0};
+	uint32_t i;
+	uint16_t nr_vring = rte_vhost_get_vring_num(priv->vid);
+	int ret = rte_vhost_get_negotiated_features(priv->vid, &priv->features);
+
+	if (ret || mlx5_vdpa_features_validate(priv)) {
+		DRV_LOG(ERR, "Failed to configure negotiated features.");
+		return -1;
+	}
+	if (nr_vring > priv->caps.max_num_virtio_queues * 2) {
+		DRV_LOG(ERR, "Do not support more than %d virtqs (requested %d).",
+			(int)priv->caps.max_num_virtio_queues * 2,
+			(int)nr_vring);
+		return -1;
+	}
+	/* Always map the entire page. */
+	priv->virtq_db_addr = mmap(NULL, priv->var->length, PROT_READ |
+				   PROT_WRITE, MAP_SHARED, priv->ctx->cmd_fd,
+				   priv->var->mmap_off);
+	if (priv->virtq_db_addr == MAP_FAILED) {
+		DRV_LOG(ERR, "Failed to map doorbell page, errno %d.", errno);
+		priv->virtq_db_addr = NULL;
+		goto error;
+	} else {
+		DRV_LOG(DEBUG, "VAR address of doorbell mapping is %p.",
+			priv->virtq_db_addr);
+	}
+	priv->td = mlx5_devx_cmd_create_td(priv->ctx);
+	if (!priv->td) {
+		DRV_LOG(ERR, "Failed to create transport domain.");
+		return -rte_errno;
+	}
+	tis_attr.transport_domain = priv->td->id;
+	priv->tis = mlx5_devx_cmd_create_tis(priv->ctx, &tis_attr);
+	if (!priv->tis) {
+		DRV_LOG(ERR, "Failed to create TIS.");
+		goto error;
+	}
+	priv->nr_virtqs = nr_vring;
+	for (i = 0; i < nr_vring; i++) {
+		claim_zero(rte_vhost_enable_guest_notification(priv->vid, i,
+							       1));
+		if (mlx5_vdpa_virtq_setup(priv, i))
+			goto error;
+	}
+	return 0;
+error:
+	mlx5_vdpa_virtqs_release(priv);
+	return -1;
+}
+
+int
+mlx5_vdpa_virtq_enable(struct mlx5_vdpa_priv *priv, int index, int enable)
+{
+	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+	int ret;
+
+	DRV_LOG(INFO, "Update virtq %d status %sable -> %sable.", index,
+		virtq->enable ? "en" : "dis", enable ? "en" : "dis");
+	if (virtq->enable == !!enable)
+		return 0;
+	if (!priv->configured) {
+		virtq->enable = !!enable;
+		return 0;
+	}
+	if (enable) {
+		/* Configuration might have been updated - reconfigure virtq. */
+		if (virtq->virtq) {
+			ret = mlx5_vdpa_virtq_stop(priv, index);
+			if (ret)
+				DRV_LOG(WARNING, "Failed to stop virtq %d.",
+					index);
+			mlx5_vdpa_virtq_unset(virtq);
+		}
+		ret = mlx5_vdpa_virtq_setup(priv, index);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to setup virtq %d.", index);
+			/* The only case the virtq can stay invalid. */
+			return ret;
+		}
+	}
+	virtq->enable = !!enable;
+	if (is_virtq_recvq(virtq->index, priv->nr_virtqs)) {
+		/* Need to add the received virtq to the RQT table of the TIRs. */
+		ret = mlx5_vdpa_steer_update(priv);
+		if (ret) {
+			virtq->enable = !enable;
+			return ret;
+		}
+	}
+	return 0;
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/rte_pmd_mlx5_vdpa_version.map b/src/spdk/dpdk/drivers/vdpa/mlx5/rte_pmd_mlx5_vdpa_version.map
new file mode 100644
index 000000000..4a76d1d52
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/rte_pmd_mlx5_vdpa_version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+	local: *;
+};
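
Note on the virtq management added in mlx5_vdpa_virtq.c above: the functions form a simple lifecycle driven by the vhost-user frontend. The sketch below shows the expected call order only; the two example_* wrapper names are hypothetical and assume the probe path has already filled the DevX context, VAR mapping and capabilities in priv (mlx5_vdpa.h included). Only the mlx5_vdpa_virtq* calls come from the file above.

/* Minimal lifecycle sketch, assuming a populated struct mlx5_vdpa_priv. */
static int
example_configure(struct mlx5_vdpa_priv *priv)
{
	/* Maps the doorbell page, creates the TD/TIS and one virtq object per
	 * vhost vring, then registers the kickfd interrupt handlers. */
	return mlx5_vdpa_virtqs_prepare(priv);
}

static void
example_teardown(struct mlx5_vdpa_priv *priv)
{
	int i;

	/* Suspend each virtq and push hw_available_index/hw_used_index back
	 * to vhost so a restart or live migration resumes at the right spot. */
	for (i = 0; i < priv->nr_virtqs; i++)
		mlx5_vdpa_virtq_stop(priv, i);
	/* Destroys the virtq objects, UMEMs, event QPs, TIS and TD, and
	 * unmaps the doorbell page. */
	mlx5_vdpa_virtqs_release(priv);
}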
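
A note on mlx5_vdpa_hva_to_gpa() above: the ring pointers returned by rte_vhost_get_vhost_vring() are host virtual addresses in the vhost process, while the device DMA descriptors need guest-physical addresses, so the helper walks the vhost memory table and keeps the offset inside the matching region. A tiny illustration with invented values (rte_vhost.h provides the structure):

/* Illustrative region only; the addresses and size below are made up. */
struct rte_vhost_mem_region example_reg = {
	.guest_phys_addr = 0x40000000,      /* GPA where the guest sees this RAM   */
	.host_user_addr  = 0x7f2000000000,  /* HVA where vhost mapped the same RAM */
	.size            = 0x10000000,      /* 256 MiB                             */
};
/* For an HVA inside the region the translation preserves the offset:
 *   gpa = hva - host_user_addr + guest_phys_addr
 * e.g. HVA 0x7f2000001000 maps to GPA 0x40001000. */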