summaryrefslogtreecommitdiffstats
path: root/src/spdk/dpdk/drivers/bus/pci/linux
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/dpdk/drivers/bus/pci/linux
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/dpdk/drivers/bus/pci/linux')
-rw-r--r--src/spdk/dpdk/drivers/bus/pci/linux/Makefile6
-rw-r--r--src/spdk/dpdk/drivers/bus/pci/linux/pci.c878
-rw-r--r--src/spdk/dpdk/drivers/bus/pci/linux/pci_init.h88
-rw-r--r--src/spdk/dpdk/drivers/bus/pci/linux/pci_uio.c569
-rw-r--r--src/spdk/dpdk/drivers/bus/pci/linux/pci_vfio.c1070
5 files changed, 2611 insertions, 0 deletions
diff --git a/src/spdk/dpdk/drivers/bus/pci/linux/Makefile b/src/spdk/dpdk/drivers/bus/pci/linux/Makefile
new file mode 100644
index 000000000..90404468b
--- /dev/null
+++ b/src/spdk/dpdk/drivers/bus/pci/linux/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 6WIND S.A.
+
+SRCS += pci.c
+SRCS += pci_uio.c
+SRCS += pci_vfio.c
diff --git a/src/spdk/dpdk/drivers/bus/pci/linux/pci.c b/src/spdk/dpdk/drivers/bus/pci/linux/pci.c
new file mode 100644
index 000000000..313e8ffe6
--- /dev/null
+++ b/src/spdk/dpdk/drivers/bus/pci/linux/pci.c
@@ -0,0 +1,878 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <string.h>
+#include <dirent.h>
+
+#include <rte_log.h>
+#include <rte_bus.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_malloc.h>
+#include <rte_devargs.h>
+#include <rte_memcpy.h>
+#include <rte_vfio.h>
+
+#include "eal_filesystem.h"
+
+#include "private.h"
+#include "pci_init.h"
+
+/**
+ * @file
+ * PCI probing under linux
+ *
+ * This code is used to simulate a PCI probe by parsing information in sysfs.
+ * When a registered device matches a driver, it is then initialized with
+ * IGB_UIO driver (or doesn't initialize, if the device wasn't bound to it).
+ */
+
+extern struct rte_pci_bus rte_pci_bus;
+
+static int
+pci_get_kernel_driver_by_path(const char *filename, char *dri_name,
+ size_t len)
+{
+ int count;
+ char path[PATH_MAX];
+ char *name;
+
+ if (!filename || !dri_name)
+ return -1;
+
+ count = readlink(filename, path, PATH_MAX);
+ if (count >= PATH_MAX)
+ return -1;
+
+ /* For device does not have a driver */
+ if (count < 0)
+ return 1;
+
+ path[count] = '\0';
+
+ name = strrchr(path, '/');
+ if (name) {
+ strlcpy(dri_name, name + 1, len);
+ return 0;
+ }
+
+ return -1;
+}
+
+/* Map pci device */
+int
+rte_pci_map_device(struct rte_pci_device *dev)
+{
+ int ret = -1;
+
+ /* try mapping the NIC resources using VFIO if it exists */
+ switch (dev->kdrv) {
+ case RTE_KDRV_VFIO:
+#ifdef VFIO_PRESENT
+ if (pci_vfio_is_enabled())
+ ret = pci_vfio_map_resource(dev);
+#endif
+ break;
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ if (rte_eal_using_phys_addrs()) {
+ /* map resources for devices that use uio */
+ ret = pci_uio_map_resource(dev);
+ }
+ break;
+ default:
+ RTE_LOG(DEBUG, EAL,
+ " Not managed by a supported kernel driver, skipped\n");
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
+
+/* Unmap pci device */
+void
+rte_pci_unmap_device(struct rte_pci_device *dev)
+{
+ /* try unmapping the NIC resources using VFIO if it exists */
+ switch (dev->kdrv) {
+ case RTE_KDRV_VFIO:
+#ifdef VFIO_PRESENT
+ if (pci_vfio_is_enabled())
+ pci_vfio_unmap_resource(dev);
+#endif
+ break;
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ /* unmap resources for devices that use uio */
+ pci_uio_unmap_resource(dev);
+ break;
+ default:
+ RTE_LOG(DEBUG, EAL,
+ " Not managed by a supported kernel driver, skipped\n");
+ break;
+ }
+}
+
+static int
+find_max_end_va(const struct rte_memseg_list *msl, void *arg)
+{
+ size_t sz = msl->len;
+ void *end_va = RTE_PTR_ADD(msl->base_va, sz);
+ void **max_va = arg;
+
+ if (*max_va < end_va)
+ *max_va = end_va;
+ return 0;
+}
+
+void *
+pci_find_max_end_va(void)
+{
+ void *va = NULL;
+
+ rte_memseg_list_walk(find_max_end_va, &va);
+ return va;
+}
+
+
+/* parse one line of the "resource" sysfs file (note that the 'line'
+ * string is modified)
+ */
+int
+pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr,
+ uint64_t *end_addr, uint64_t *flags)
+{
+ union pci_resource_info {
+ struct {
+ char *phys_addr;
+ char *end_addr;
+ char *flags;
+ };
+ char *ptrs[PCI_RESOURCE_FMT_NVAL];
+ } res_info;
+
+ if (rte_strsplit(line, len, res_info.ptrs, 3, ' ') != 3) {
+ RTE_LOG(ERR, EAL,
+ "%s(): bad resource format\n", __func__);
+ return -1;
+ }
+ errno = 0;
+ *phys_addr = strtoull(res_info.phys_addr, NULL, 16);
+ *end_addr = strtoull(res_info.end_addr, NULL, 16);
+ *flags = strtoull(res_info.flags, NULL, 16);
+ if (errno != 0) {
+ RTE_LOG(ERR, EAL,
+ "%s(): bad resource format\n", __func__);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* parse the "resource" sysfs file */
+static int
+pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev)
+{
+ FILE *f;
+ char buf[BUFSIZ];
+ int i;
+ uint64_t phys_addr, end_addr, flags;
+
+ f = fopen(filename, "r");
+ if (f == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot open sysfs resource\n");
+ return -1;
+ }
+
+ for (i = 0; i<PCI_MAX_RESOURCE; i++) {
+
+ if (fgets(buf, sizeof(buf), f) == NULL) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot read resource\n", __func__);
+ goto error;
+ }
+ if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr,
+ &end_addr, &flags) < 0)
+ goto error;
+
+ if (flags & IORESOURCE_MEM) {
+ dev->mem_resource[i].phys_addr = phys_addr;
+ dev->mem_resource[i].len = end_addr - phys_addr + 1;
+ /* not mapped for now */
+ dev->mem_resource[i].addr = NULL;
+ }
+ }
+ fclose(f);
+ return 0;
+
+error:
+ fclose(f);
+ return -1;
+}
+
+/* Scan one pci sysfs entry, and fill the devices list from it. */
+static int
+pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
+{
+ char filename[PATH_MAX];
+ unsigned long tmp;
+ struct rte_pci_device *dev;
+ char driver[PATH_MAX];
+ int ret;
+
+ dev = malloc(sizeof(*dev));
+ if (dev == NULL)
+ return -1;
+
+ memset(dev, 0, sizeof(*dev));
+ dev->device.bus = &rte_pci_bus.bus;
+ dev->addr = *addr;
+
+ /* get vendor id */
+ snprintf(filename, sizeof(filename), "%s/vendor", dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.vendor_id = (uint16_t)tmp;
+
+ /* get device id */
+ snprintf(filename, sizeof(filename), "%s/device", dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.device_id = (uint16_t)tmp;
+
+ /* get subsystem_vendor id */
+ snprintf(filename, sizeof(filename), "%s/subsystem_vendor",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.subsystem_vendor_id = (uint16_t)tmp;
+
+ /* get subsystem_device id */
+ snprintf(filename, sizeof(filename), "%s/subsystem_device",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.subsystem_device_id = (uint16_t)tmp;
+
+ /* get class_id */
+ snprintf(filename, sizeof(filename), "%s/class",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ /* the least 24 bits are valid: class, subclass, program interface */
+ dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID;
+
+ /* get max_vfs */
+ dev->max_vfs = 0;
+ snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
+ if (!access(filename, F_OK) &&
+ eal_parse_sysfs_value(filename, &tmp) == 0)
+ dev->max_vfs = (uint16_t)tmp;
+ else {
+ /* for non igb_uio driver, need kernel version >= 3.8 */
+ snprintf(filename, sizeof(filename),
+ "%s/sriov_numvfs", dirname);
+ if (!access(filename, F_OK) &&
+ eal_parse_sysfs_value(filename, &tmp) == 0)
+ dev->max_vfs = (uint16_t)tmp;
+ }
+
+ /* get numa node, default to 0 if not present */
+ snprintf(filename, sizeof(filename), "%s/numa_node",
+ dirname);
+
+ if (access(filename, F_OK) != -1) {
+ if (eal_parse_sysfs_value(filename, &tmp) == 0)
+ dev->device.numa_node = tmp;
+ else
+ dev->device.numa_node = -1;
+ } else {
+ dev->device.numa_node = 0;
+ }
+
+ pci_name_set(dev);
+
+ /* parse resources */
+ snprintf(filename, sizeof(filename), "%s/resource", dirname);
+ if (pci_parse_sysfs_resource(filename, dev) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__);
+ free(dev);
+ return -1;
+ }
+
+ /* parse driver */
+ snprintf(filename, sizeof(filename), "%s/driver", dirname);
+ ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Fail to get kernel driver\n");
+ free(dev);
+ return -1;
+ }
+
+ if (!ret) {
+ if (!strcmp(driver, "vfio-pci"))
+ dev->kdrv = RTE_KDRV_VFIO;
+ else if (!strcmp(driver, "igb_uio"))
+ dev->kdrv = RTE_KDRV_IGB_UIO;
+ else if (!strcmp(driver, "uio_pci_generic"))
+ dev->kdrv = RTE_KDRV_UIO_GENERIC;
+ else
+ dev->kdrv = RTE_KDRV_UNKNOWN;
+ } else
+ dev->kdrv = RTE_KDRV_NONE;
+
+ /* device is valid, add in list (sorted) */
+ if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
+ rte_pci_add_device(dev);
+ } else {
+ struct rte_pci_device *dev2;
+ int ret;
+
+ TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
+ ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
+ if (ret > 0)
+ continue;
+
+ if (ret < 0) {
+ rte_pci_insert_device(dev2, dev);
+ } else { /* already registered */
+ if (!rte_dev_is_probed(&dev2->device)) {
+ dev2->kdrv = dev->kdrv;
+ dev2->max_vfs = dev->max_vfs;
+ memcpy(&dev2->id, &dev->id, sizeof(dev2->id));
+ pci_name_set(dev2);
+ memmove(dev2->mem_resource,
+ dev->mem_resource,
+ sizeof(dev->mem_resource));
+ } else {
+ /**
+ * If device is plugged and driver is
+ * probed already, (This happens when
+ * we call rte_dev_probe which will
+ * scan all device on the bus) we don't
+ * need to do anything here unless...
+ **/
+ if (dev2->kdrv != dev->kdrv ||
+ dev2->max_vfs != dev->max_vfs ||
+ memcmp(&dev2->id, &dev->id, sizeof(dev2->id)))
+ /*
+ * This should not happens.
+ * But it is still possible if
+ * we unbind a device from
+ * vfio or uio before hotplug
+ * remove and rebind it with
+ * a different configure.
+ * So we just print out the
+ * error as an alarm.
+ */
+ RTE_LOG(ERR, EAL, "Unexpected device scan at %s!\n",
+ filename);
+ else if (dev2->device.devargs !=
+ dev->device.devargs) {
+ rte_devargs_remove(dev2->device.devargs);
+ pci_name_set(dev2);
+ }
+ }
+ free(dev);
+ }
+ return 0;
+ }
+
+ rte_pci_add_device(dev);
+ }
+
+ return 0;
+}
+
+int
+pci_update_device(const struct rte_pci_addr *addr)
+{
+ char filename[PATH_MAX];
+
+ snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT,
+ rte_pci_get_sysfs_path(), addr->domain, addr->bus, addr->devid,
+ addr->function);
+
+ return pci_scan_one(filename, addr);
+}
+
+/*
+ * split up a pci address into its constituent parts.
+ */
+static int
+parse_pci_addr_format(const char *buf, int bufsize, struct rte_pci_addr *addr)
+{
+ /* first split on ':' */
+ union splitaddr {
+ struct {
+ char *domain;
+ char *bus;
+ char *devid;
+ char *function;
+ };
+ char *str[PCI_FMT_NVAL]; /* last element-separator is "." not ":" */
+ } splitaddr;
+
+ char *buf_copy = strndup(buf, bufsize);
+ if (buf_copy == NULL)
+ return -1;
+
+ if (rte_strsplit(buf_copy, bufsize, splitaddr.str, PCI_FMT_NVAL, ':')
+ != PCI_FMT_NVAL - 1)
+ goto error;
+ /* final split is on '.' between devid and function */
+ splitaddr.function = strchr(splitaddr.devid,'.');
+ if (splitaddr.function == NULL)
+ goto error;
+ *splitaddr.function++ = '\0';
+
+ /* now convert to int values */
+ errno = 0;
+ addr->domain = strtoul(splitaddr.domain, NULL, 16);
+ addr->bus = strtoul(splitaddr.bus, NULL, 16);
+ addr->devid = strtoul(splitaddr.devid, NULL, 16);
+ addr->function = strtoul(splitaddr.function, NULL, 10);
+ if (errno != 0)
+ goto error;
+
+ free(buf_copy); /* free the copy made with strdup */
+ return 0;
+error:
+ free(buf_copy);
+ return -1;
+}
+
+/*
+ * Scan the content of the PCI bus, and the devices in the devices
+ * list
+ */
+int
+rte_pci_scan(void)
+{
+ struct dirent *e;
+ DIR *dir;
+ char dirname[PATH_MAX];
+ struct rte_pci_addr addr;
+
+ /* for debug purposes, PCI can be disabled */
+ if (!rte_eal_has_pci())
+ return 0;
+
+#ifdef VFIO_PRESENT
+ if (!pci_vfio_is_enabled())
+ RTE_LOG(DEBUG, EAL, "VFIO PCI modules not loaded\n");
+#endif
+
+ dir = opendir(rte_pci_get_sysfs_path());
+ if (dir == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+
+ while ((e = readdir(dir)) != NULL) {
+ if (e->d_name[0] == '.')
+ continue;
+
+ if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
+ continue;
+
+ if (rte_pci_ignore_device(&addr))
+ continue;
+
+ snprintf(dirname, sizeof(dirname), "%s/%s",
+ rte_pci_get_sysfs_path(), e->d_name);
+
+ if (pci_scan_one(dirname, &addr) < 0)
+ goto error;
+ }
+ closedir(dir);
+ return 0;
+
+error:
+ closedir(dir);
+ return -1;
+}
+
+#if defined(RTE_ARCH_X86)
+bool
+pci_device_iommu_support_va(const struct rte_pci_device *dev)
+{
+#define VTD_CAP_MGAW_SHIFT 16
+#define VTD_CAP_MGAW_MASK (0x3fULL << VTD_CAP_MGAW_SHIFT)
+ const struct rte_pci_addr *addr = &dev->addr;
+ char filename[PATH_MAX];
+ FILE *fp;
+ uint64_t mgaw, vtd_cap_reg = 0;
+
+ snprintf(filename, sizeof(filename),
+ "%s/" PCI_PRI_FMT "/iommu/intel-iommu/cap",
+ rte_pci_get_sysfs_path(), addr->domain, addr->bus, addr->devid,
+ addr->function);
+
+ fp = fopen(filename, "r");
+ if (fp == NULL) {
+ /* We don't have an Intel IOMMU, assume VA supported */
+ if (errno == ENOENT)
+ return true;
+
+ RTE_LOG(ERR, EAL, "%s(): can't open %s: %s\n",
+ __func__, filename, strerror(errno));
+ return false;
+ }
+
+ /* We have an Intel IOMMU */
+ if (fscanf(fp, "%" PRIx64, &vtd_cap_reg) != 1) {
+ RTE_LOG(ERR, EAL, "%s(): can't read %s\n", __func__, filename);
+ fclose(fp);
+ return false;
+ }
+
+ fclose(fp);
+
+ mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
+
+ /*
+ * Assuming there is no limitation by now. We can not know at this point
+ * because the memory has not been initialized yet. Setting the dma mask
+ * will force a check once memory initialization is done. We can not do
+ * a fallback to IOVA PA now, but if the dma check fails, the error
+ * message should advice for using '--iova-mode pa' if IOVA VA is the
+ * current mode.
+ */
+ rte_mem_set_dma_mask(mgaw);
+ return true;
+}
+#elif defined(RTE_ARCH_PPC_64)
+bool
+pci_device_iommu_support_va(__rte_unused const struct rte_pci_device *dev)
+{
+ /*
+ * IOMMU is always present on a PowerNV host (IOMMUv2).
+ * IOMMU is also present in a KVM/QEMU VM (IOMMUv1) but is not
+ * currently supported by DPDK. Test for our current environment
+ * and report VA support as appropriate.
+ */
+
+ char *line = NULL;
+ size_t len = 0;
+ char filename[PATH_MAX] = "/proc/cpuinfo";
+ FILE *fp = fopen(filename, "r");
+ bool ret = false;
+
+ if (fp == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): can't open %s: %s\n",
+ __func__, filename, strerror(errno));
+ return ret;
+ }
+
+ /* Check for a PowerNV platform */
+ while (getline(&line, &len, fp) != -1) {
+ if (strstr(line, "platform") != NULL)
+ continue;
+
+ if (strstr(line, "PowerNV") != NULL) {
+ RTE_LOG(DEBUG, EAL, "Running on a PowerNV system\n");
+ ret = true;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(fp);
+ return ret;
+}
+#else
+bool
+pci_device_iommu_support_va(__rte_unused const struct rte_pci_device *dev)
+{
+ return true;
+}
+#endif
+
+enum rte_iova_mode
+pci_device_iova_mode(const struct rte_pci_driver *pdrv,
+ const struct rte_pci_device *pdev)
+{
+ enum rte_iova_mode iova_mode = RTE_IOVA_DC;
+
+ switch (pdev->kdrv) {
+ case RTE_KDRV_VFIO: {
+#ifdef VFIO_PRESENT
+ static int is_vfio_noiommu_enabled = -1;
+
+ if (is_vfio_noiommu_enabled == -1) {
+ if (rte_vfio_noiommu_is_enabled() == 1)
+ is_vfio_noiommu_enabled = 1;
+ else
+ is_vfio_noiommu_enabled = 0;
+ }
+ if (is_vfio_noiommu_enabled != 0)
+ iova_mode = RTE_IOVA_PA;
+ else if ((pdrv->drv_flags & RTE_PCI_DRV_NEED_IOVA_AS_VA) != 0)
+ iova_mode = RTE_IOVA_VA;
+#endif
+ break;
+ }
+
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ iova_mode = RTE_IOVA_PA;
+ break;
+
+ default:
+ if ((pdrv->drv_flags & RTE_PCI_DRV_NEED_IOVA_AS_VA) != 0)
+ iova_mode = RTE_IOVA_VA;
+ break;
+ }
+ return iova_mode;
+}
+
+/* Read PCI config space. */
+int rte_pci_read_config(const struct rte_pci_device *device,
+ void *buf, size_t len, off_t offset)
+{
+ char devname[RTE_DEV_NAME_MAX_LEN] = "";
+ const struct rte_intr_handle *intr_handle = &device->intr_handle;
+
+ switch (device->kdrv) {
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ return pci_uio_read_config(intr_handle, buf, len, offset);
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ return pci_vfio_read_config(intr_handle, buf, len, offset);
+#endif
+ default:
+ rte_pci_device_name(&device->addr, devname,
+ RTE_DEV_NAME_MAX_LEN);
+ RTE_LOG(ERR, EAL,
+ "Unknown driver type for %s\n", devname);
+ return -1;
+ }
+}
+
+/* Write PCI config space. */
+int rte_pci_write_config(const struct rte_pci_device *device,
+ const void *buf, size_t len, off_t offset)
+{
+ char devname[RTE_DEV_NAME_MAX_LEN] = "";
+ const struct rte_intr_handle *intr_handle = &device->intr_handle;
+
+ switch (device->kdrv) {
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ return pci_uio_write_config(intr_handle, buf, len, offset);
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ return pci_vfio_write_config(intr_handle, buf, len, offset);
+#endif
+ default:
+ rte_pci_device_name(&device->addr, devname,
+ RTE_DEV_NAME_MAX_LEN);
+ RTE_LOG(ERR, EAL,
+ "Unknown driver type for %s\n", devname);
+ return -1;
+ }
+}
+
+#if defined(RTE_ARCH_X86)
+static int
+pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused,
+ struct rte_pci_ioport *p)
+{
+ uint16_t start, end;
+ FILE *fp;
+ char *line = NULL;
+ char pci_id[16];
+ int found = 0;
+ size_t linesz;
+
+ if (rte_eal_iopl_init() != 0) {
+ RTE_LOG(ERR, EAL, "%s(): insufficient ioport permissions for PCI device %s\n",
+ __func__, dev->name);
+ return -1;
+ }
+
+ snprintf(pci_id, sizeof(pci_id), PCI_PRI_FMT,
+ dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function);
+
+ fp = fopen("/proc/ioports", "r");
+ if (fp == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): can't open ioports\n", __func__);
+ return -1;
+ }
+
+ while (getdelim(&line, &linesz, '\n', fp) > 0) {
+ char *ptr = line;
+ char *left;
+ int n;
+
+ n = strcspn(ptr, ":");
+ ptr[n] = 0;
+ left = &ptr[n + 1];
+
+ while (*left && isspace(*left))
+ left++;
+
+ if (!strncmp(left, pci_id, strlen(pci_id))) {
+ found = 1;
+
+ while (*ptr && isspace(*ptr))
+ ptr++;
+
+ sscanf(ptr, "%04hx-%04hx", &start, &end);
+
+ break;
+ }
+ }
+
+ free(line);
+ fclose(fp);
+
+ if (!found)
+ return -1;
+
+ p->base = start;
+ RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%x\n", start);
+
+ return 0;
+}
+#endif
+
+int
+rte_pci_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ int ret = -1;
+
+ switch (dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ if (pci_vfio_is_enabled())
+ ret = pci_vfio_ioport_map(dev, bar, p);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ ret = pci_uio_ioport_map(dev, bar, p);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+#if defined(RTE_ARCH_X86)
+ ret = pci_ioport_map(dev, bar, p);
+#else
+ ret = pci_uio_ioport_map(dev, bar, p);
+#endif
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ ret = pci_ioport_map(dev, bar, p);
+#endif
+ break;
+ default:
+ break;
+ }
+
+ if (!ret)
+ p->dev = dev;
+
+ return ret;
+}
+
+void
+rte_pci_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
+{
+ switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ pci_vfio_ioport_read(p, data, len, offset);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ pci_uio_ioport_read(p, data, len, offset);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+ pci_uio_ioport_read(p, data, len, offset);
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ pci_uio_ioport_read(p, data, len, offset);
+#endif
+ break;
+ default:
+ break;
+ }
+}
+
+void
+rte_pci_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
+{
+ switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ pci_vfio_ioport_write(p, data, len, offset);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ pci_uio_ioport_write(p, data, len, offset);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+ pci_uio_ioport_write(p, data, len, offset);
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ pci_uio_ioport_write(p, data, len, offset);
+#endif
+ break;
+ default:
+ break;
+ }
+}
+
+int
+rte_pci_ioport_unmap(struct rte_pci_ioport *p)
+{
+ int ret = -1;
+
+ switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ if (pci_vfio_is_enabled())
+ ret = pci_vfio_ioport_unmap(p);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ ret = pci_uio_ioport_unmap(p);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+#if defined(RTE_ARCH_X86)
+ ret = 0;
+#else
+ ret = pci_uio_ioport_unmap(p);
+#endif
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ ret = 0;
+#endif
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
diff --git a/src/spdk/dpdk/drivers/bus/pci/linux/pci_init.h b/src/spdk/dpdk/drivers/bus/pci/linux/pci_init.h
new file mode 100644
index 000000000..c2e603a37
--- /dev/null
+++ b/src/spdk/dpdk/drivers/bus/pci/linux/pci_init.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef EAL_PCI_INIT_H_
+#define EAL_PCI_INIT_H_
+
+#include <rte_vfio.h>
+
+/** IO resource type: */
+#define IORESOURCE_IO 0x00000100
+#define IORESOURCE_MEM 0x00000200
+
+/*
+ * Helper function to map PCI resources right after hugepages in virtual memory
+ */
+extern void *pci_map_addr;
+void *pci_find_max_end_va(void);
+
+/* parse one line of the "resource" sysfs file (note that the 'line'
+ * string is modified)
+ */
+int pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr,
+ uint64_t *end_addr, uint64_t *flags);
+
+int pci_uio_alloc_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource **uio_res);
+void pci_uio_free_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource *uio_res);
+int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
+ struct mapped_pci_resource *uio_res, int map_idx);
+
+int pci_uio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offs);
+int pci_uio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offs);
+
+int pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p);
+void pci_uio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset);
+void pci_uio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset);
+int pci_uio_ioport_unmap(struct rte_pci_ioport *p);
+
+#ifdef VFIO_PRESENT
+
+#ifdef PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#else
+#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#endif
+
+#ifdef PCI_MSIX_TABLE_OFFSET
+#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
+#else
+#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
+#endif
+
+#ifdef PCI_MSIX_FLAGS_QSIZE
+#define RTE_PCI_MSIX_FLAGS_QSIZE PCI_MSIX_FLAGS_QSIZE
+#else
+#define RTE_PCI_MSIX_FLAGS_QSIZE 0x07ff
+#endif
+
+/* access config space */
+int pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offs);
+int pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offs);
+
+int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p);
+void pci_vfio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset);
+void pci_vfio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset);
+int pci_vfio_ioport_unmap(struct rte_pci_ioport *p);
+
+/* map/unmap VFIO resource prototype */
+int pci_vfio_map_resource(struct rte_pci_device *dev);
+int pci_vfio_unmap_resource(struct rte_pci_device *dev);
+
+int pci_vfio_is_enabled(void);
+
+#endif
+
+#endif /* EAL_PCI_INIT_H_ */
diff --git a/src/spdk/dpdk/drivers/bus/pci/linux/pci_uio.c b/src/spdk/dpdk/drivers/bus/pci/linux/pci_uio.c
new file mode 100644
index 000000000..097dc1922
--- /dev/null
+++ b/src/spdk/dpdk/drivers/bus/pci/linux/pci_uio.c
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/sysmacros.h>
+#include <linux/pci_regs.h>
+
+#if defined(RTE_ARCH_X86)
+#include <sys/io.h>
+#endif
+
+#include <rte_string_fns.h>
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_common.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "pci_init.h"
+
+void *pci_map_addr = NULL;
+
+#define OFF_MAX ((uint64_t)(off_t)-1)
+
+int
+pci_uio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offset)
+{
+ return pread(intr_handle->uio_cfg_fd, buf, len, offset);
+}
+
+int
+pci_uio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offset)
+{
+ return pwrite(intr_handle->uio_cfg_fd, buf, len, offset);
+}
+
+static int
+pci_uio_set_bus_master(int dev_fd)
+{
+ uint16_t reg;
+ int ret;
+
+ ret = pread(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL,
+ "Cannot read command from PCI config space!\n");
+ return -1;
+ }
+
+ /* return if bus mastering is already on */
+ if (reg & PCI_COMMAND_MASTER)
+ return 0;
+
+ reg |= PCI_COMMAND_MASTER;
+
+ ret = pwrite(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL,
+ "Cannot write command to PCI config space!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+pci_mknod_uio_dev(const char *sysfs_uio_path, unsigned uio_num)
+{
+ FILE *f;
+ char filename[PATH_MAX];
+ int ret;
+ unsigned major, minor;
+ dev_t dev;
+
+ /* get the name of the sysfs file that contains the major and minor
+ * of the uio device and read its content */
+ snprintf(filename, sizeof(filename), "%s/dev", sysfs_uio_path);
+
+ f = fopen(filename, "r");
+ if (f == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n",
+ __func__);
+ return -1;
+ }
+
+ ret = fscanf(f, "%u:%u", &major, &minor);
+ if (ret != 2) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n",
+ __func__);
+ fclose(f);
+ return -1;
+ }
+ fclose(f);
+
+ /* create the char device "mknod /dev/uioX c major minor" */
+ snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num);
+ dev = makedev(major, minor);
+ ret = mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, dev);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "%s(): mknod() failed %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+
+ return ret;
+}
+
+/*
+ * Return the uioX char device used for a pci device. On success, return
+ * the UIO number and fill dstbuf string with the path of the device in
+ * sysfs. On error, return a negative value. In this case dstbuf is
+ * invalid.
+ */
+static int
+pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
+ unsigned int buflen, int create)
+{
+ struct rte_pci_addr *loc = &dev->addr;
+ int uio_num = -1;
+ struct dirent *e;
+ DIR *dir;
+ char dirname[PATH_MAX];
+
+ /* depending on kernel version, uio can be located in uio/uioX
+ * or uio:uioX */
+
+ snprintf(dirname, sizeof(dirname),
+ "%s/" PCI_PRI_FMT "/uio", rte_pci_get_sysfs_path(),
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+ dir = opendir(dirname);
+ if (dir == NULL) {
+ /* retry with the parent directory */
+ snprintf(dirname, sizeof(dirname),
+ "%s/" PCI_PRI_FMT, rte_pci_get_sysfs_path(),
+ loc->domain, loc->bus, loc->devid, loc->function);
+ dir = opendir(dirname);
+
+ if (dir == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot opendir %s\n", dirname);
+ return -1;
+ }
+ }
+
+ /* take the first file starting with "uio" */
+ while ((e = readdir(dir)) != NULL) {
+ /* format could be uio%d ...*/
+ int shortprefix_len = sizeof("uio") - 1;
+ /* ... or uio:uio%d */
+ int longprefix_len = sizeof("uio:uio") - 1;
+ char *endptr;
+
+ if (strncmp(e->d_name, "uio", 3) != 0)
+ continue;
+
+ /* first try uio%d */
+ errno = 0;
+ uio_num = strtoull(e->d_name + shortprefix_len, &endptr, 10);
+ if (errno == 0 && endptr != (e->d_name + shortprefix_len)) {
+ snprintf(dstbuf, buflen, "%s/uio%u", dirname, uio_num);
+ break;
+ }
+
+ /* then try uio:uio%d */
+ errno = 0;
+ uio_num = strtoull(e->d_name + longprefix_len, &endptr, 10);
+ if (errno == 0 && endptr != (e->d_name + longprefix_len)) {
+ snprintf(dstbuf, buflen, "%s/uio:uio%u", dirname, uio_num);
+ break;
+ }
+ }
+ closedir(dir);
+
+ /* No uio resource found */
+ if (e == NULL)
+ return -1;
+
+ /* create uio device if we've been asked to */
+ if (rte_eal_create_uio_dev() && create &&
+ pci_mknod_uio_dev(dstbuf, uio_num) < 0)
+ RTE_LOG(WARNING, EAL, "Cannot create /dev/uio%u\n", uio_num);
+
+ return uio_num;
+}
+
+void
+pci_uio_free_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource *uio_res)
+{
+ rte_free(uio_res);
+
+ if (dev->intr_handle.uio_cfg_fd >= 0) {
+ close(dev->intr_handle.uio_cfg_fd);
+ dev->intr_handle.uio_cfg_fd = -1;
+ }
+ if (dev->intr_handle.fd >= 0) {
+ close(dev->intr_handle.fd);
+ dev->intr_handle.fd = -1;
+ dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+ }
+}
+
+int
+pci_uio_alloc_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource **uio_res)
+{
+ char dirname[PATH_MAX];
+ char cfgname[PATH_MAX];
+ char devname[PATH_MAX]; /* contains the /dev/uioX */
+ int uio_num;
+ struct rte_pci_addr *loc;
+
+ loc = &dev->addr;
+
+ /* find uio resource */
+ uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1);
+ if (uio_num < 0) {
+ RTE_LOG(WARNING, EAL, " "PCI_PRI_FMT" not managed by UIO driver, "
+ "skipping\n", loc->domain, loc->bus, loc->devid, loc->function);
+ return 1;
+ }
+ snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
+
+ /* save fd if in primary process */
+ dev->intr_handle.fd = open(devname, O_RDWR);
+ if (dev->intr_handle.fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ devname, strerror(errno));
+ goto error;
+ }
+
+ snprintf(cfgname, sizeof(cfgname),
+ "/sys/class/uio/uio%u/device/config", uio_num);
+ dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
+ if (dev->intr_handle.uio_cfg_fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ cfgname, strerror(errno));
+ goto error;
+ }
+
+ if (dev->kdrv == RTE_KDRV_IGB_UIO)
+ dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
+ else {
+ dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX;
+
+ /* set bus master that is not done by uio_pci_generic */
+ if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
+ RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
+ goto error;
+ }
+ }
+
+ /* allocate the mapping details for secondary processes*/
+ *uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
+ if (*uio_res == NULL) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot store uio mmap details\n", __func__);
+ goto error;
+ }
+
+ strlcpy((*uio_res)->path, devname, sizeof((*uio_res)->path));
+ memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr));
+
+ return 0;
+
+error:
+ pci_uio_free_resource(dev, *uio_res);
+ return -1;
+}
+
+int
+pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
+ struct mapped_pci_resource *uio_res, int map_idx)
+{
+ int fd = -1;
+ char devname[PATH_MAX];
+ void *mapaddr;
+ struct rte_pci_addr *loc;
+ struct pci_map *maps;
+ int wc_activate = 0;
+
+ if (dev->driver != NULL)
+ wc_activate = dev->driver->drv_flags & RTE_PCI_DRV_WC_ACTIVATE;
+
+ loc = &dev->addr;
+ maps = uio_res->maps;
+
+ /* allocate memory to keep path */
+ maps[map_idx].path = rte_malloc(NULL, sizeof(devname), 0);
+ if (maps[map_idx].path == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ /*
+ * open resource file, to mmap it
+ */
+ if (wc_activate) {
+ /* update devname for mmap */
+ snprintf(devname, sizeof(devname),
+ "%s/" PCI_PRI_FMT "/resource%d_wc",
+ rte_pci_get_sysfs_path(),
+ loc->domain, loc->bus, loc->devid,
+ loc->function, res_idx);
+
+ fd = open(devname, O_RDWR);
+ if (fd < 0 && errno != ENOENT) {
+ RTE_LOG(INFO, EAL, "%s cannot be mapped. "
+ "Fall-back to non prefetchable mode.\n",
+ devname);
+ }
+ }
+
+ if (!wc_activate || fd < 0) {
+ snprintf(devname, sizeof(devname),
+ "%s/" PCI_PRI_FMT "/resource%d",
+ rte_pci_get_sysfs_path(),
+ loc->domain, loc->bus, loc->devid,
+ loc->function, res_idx);
+
+ /* then try to map resource file */
+ fd = open(devname, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ devname, strerror(errno));
+ goto error;
+ }
+ }
+
+ /* try mapping somewhere close to the end of hugepages */
+ if (pci_map_addr == NULL)
+ pci_map_addr = pci_find_max_end_va();
+
+ mapaddr = pci_map_resource(pci_map_addr, fd, 0,
+ (size_t)dev->mem_resource[res_idx].len, 0);
+ close(fd);
+ if (mapaddr == MAP_FAILED)
+ goto error;
+
+ pci_map_addr = RTE_PTR_ADD(mapaddr,
+ (size_t)dev->mem_resource[res_idx].len);
+
+ pci_map_addr = RTE_PTR_ALIGN(pci_map_addr, sysconf(_SC_PAGE_SIZE));
+
+ maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
+ maps[map_idx].size = dev->mem_resource[res_idx].len;
+ maps[map_idx].addr = mapaddr;
+ maps[map_idx].offset = 0;
+ strcpy(maps[map_idx].path, devname);
+ dev->mem_resource[res_idx].addr = mapaddr;
+
+ return 0;
+
+error:
+ rte_free(maps[map_idx].path);
+ return -1;
+}
+
+#if defined(RTE_ARCH_X86)
+int
+pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ char dirname[PATH_MAX];
+ char filename[PATH_MAX];
+ int uio_num;
+ unsigned long start;
+
+ if (rte_eal_iopl_init() != 0) {
+ RTE_LOG(ERR, EAL, "%s(): insufficient ioport permissions for PCI device %s\n",
+ __func__, dev->name);
+ return -1;
+ }
+
+ uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 0);
+ if (uio_num < 0)
+ return -1;
+
+ /* get portio start */
+ snprintf(filename, sizeof(filename),
+ "%s/portio/port%d/start", dirname, bar);
+ if (eal_parse_sysfs_value(filename, &start) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse portio start\n",
+ __func__);
+ return -1;
+ }
+ /* ensure we don't get anything funny here, read/write will cast to
+ * uin16_t */
+ if (start > UINT16_MAX)
+ return -1;
+
+ /* FIXME only for primary process ? */
+ if (dev->intr_handle.type == RTE_INTR_HANDLE_UNKNOWN) {
+
+ snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num);
+ dev->intr_handle.fd = open(filename, O_RDWR);
+ if (dev->intr_handle.fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ filename, strerror(errno));
+ return -1;
+ }
+ dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
+ }
+
+ RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx\n", start);
+
+ p->base = start;
+ p->len = 0;
+ return 0;
+}
+#else
+int
+pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ FILE *f;
+ char buf[BUFSIZ];
+ char filename[PATH_MAX];
+ uint64_t phys_addr, end_addr, flags;
+ int fd, i;
+ void *addr;
+
+ /* open and read addresses of the corresponding resource in sysfs */
+ snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource",
+ rte_pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function);
+ f = fopen(filename, "r");
+ if (f == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot open sysfs resource: %s\n",
+ strerror(errno));
+ return -1;
+ }
+ for (i = 0; i < bar + 1; i++) {
+ if (fgets(buf, sizeof(buf), f) == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot read sysfs resource\n");
+ goto error;
+ }
+ }
+ if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr,
+ &end_addr, &flags) < 0)
+ goto error;
+ if ((flags & IORESOURCE_IO) == 0) {
+ RTE_LOG(ERR, EAL, "BAR %d is not an IO resource\n", bar);
+ goto error;
+ }
+ snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource%d",
+ rte_pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function, bar);
+
+ /* mmap the pci resource */
+ fd = open(filename, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+ strerror(errno));
+ goto error;
+ }
+ addr = mmap(NULL, end_addr + 1, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Cannot mmap IO port resource: %s\n",
+ strerror(errno));
+ goto error;
+ }
+
+ /* strangely, the base address is mmap addr + phys_addr */
+ p->base = (uintptr_t)addr + phys_addr;
+ p->len = end_addr + 1;
+ RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%"PRIx64"\n", p->base);
+ fclose(f);
+
+ return 0;
+
+error:
+ fclose(f);
+ return -1;
+}
+#endif
+
+void
+pci_uio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
+{
+ uint8_t *d;
+ int size;
+ uintptr_t reg = p->base + offset;
+
+ for (d = data; len > 0; d += size, reg += size, len -= size) {
+ if (len >= 4) {
+ size = 4;
+#if defined(RTE_ARCH_X86)
+ *(uint32_t *)d = inl(reg);
+#else
+ *(uint32_t *)d = *(volatile uint32_t *)reg;
+#endif
+ } else if (len >= 2) {
+ size = 2;
+#if defined(RTE_ARCH_X86)
+ *(uint16_t *)d = inw(reg);
+#else
+ *(uint16_t *)d = *(volatile uint16_t *)reg;
+#endif
+ } else {
+ size = 1;
+#if defined(RTE_ARCH_X86)
+ *d = inb(reg);
+#else
+ *d = *(volatile uint8_t *)reg;
+#endif
+ }
+ }
+}
+
+void
+pci_uio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
+{
+ const uint8_t *s;
+ int size;
+ uintptr_t reg = p->base + offset;
+
+ for (s = data; len > 0; s += size, reg += size, len -= size) {
+ if (len >= 4) {
+ size = 4;
+#if defined(RTE_ARCH_X86)
+ outl_p(*(const uint32_t *)s, reg);
+#else
+ *(volatile uint32_t *)reg = *(const uint32_t *)s;
+#endif
+ } else if (len >= 2) {
+ size = 2;
+#if defined(RTE_ARCH_X86)
+ outw_p(*(const uint16_t *)s, reg);
+#else
+ *(volatile uint16_t *)reg = *(const uint16_t *)s;
+#endif
+ } else {
+ size = 1;
+#if defined(RTE_ARCH_X86)
+ outb_p(*s, reg);
+#else
+ *(volatile uint8_t *)reg = *s;
+#endif
+ }
+ }
+}
+
+int
+pci_uio_ioport_unmap(struct rte_pci_ioport *p)
+{
+#if defined(RTE_ARCH_X86)
+ RTE_SET_USED(p);
+ /* FIXME close intr fd ? */
+ return 0;
+#else
+ return munmap((void *)(uintptr_t)p->base, p->len);
+#endif
+}
diff --git a/src/spdk/dpdk/drivers/bus/pci/linux/pci_vfio.c b/src/spdk/dpdk/drivers/bus/pci/linux/pci_vfio.c
new file mode 100644
index 000000000..64cd84a68
--- /dev/null
+++ b/src/spdk/dpdk/drivers/bus/pci/linux/pci_vfio.c
@@ -0,0 +1,1070 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <linux/pci_regs.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_malloc.h>
+#include <rte_vfio.h>
+#include <rte_eal.h>
+#include <rte_bus.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "eal_filesystem.h"
+
+#include "pci_init.h"
+#include "private.h"
+
+/**
+ * @file
+ * PCI probing under linux (VFIO version)
+ *
+ * This code tries to determine if the PCI device is bound to VFIO driver,
+ * and initialize it (map BARs, set up interrupts) if that's the case.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE (sysconf(_SC_PAGESIZE))
+#endif
+#define PAGE_MASK (~(PAGE_SIZE - 1))
+
+static struct rte_tailq_elem rte_vfio_tailq = {
+ .name = "VFIO_RESOURCE_LIST",
+};
+EAL_REGISTER_TAILQ(rte_vfio_tailq)
+
+int
+pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offs)
+{
+ return pread64(intr_handle->vfio_dev_fd, buf, len,
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
+}
+
+int
+pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offs)
+{
+ return pwrite64(intr_handle->vfio_dev_fd, buf, len,
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
+}
+
+/* get PCI BAR number where MSI-X interrupts are */
+static int
+pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
+{
+ int ret;
+ uint32_t reg;
+ uint16_t flags;
+ uint8_t cap_id, cap_offset;
+
+ /* read PCI capability pointer from config space */
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_CAPABILITY_LIST);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+ "config space!\n");
+ return -1;
+ }
+
+ /* we need first byte */
+ cap_offset = reg & 0xFF;
+
+ while (cap_offset) {
+
+ /* read PCI capability ID */
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
+ "config space!\n");
+ return -1;
+ }
+
+ /* we need first byte */
+ cap_id = reg & 0xFF;
+
+ /* if we haven't reached MSI-X, check next capability */
+ if (cap_id != PCI_CAP_ID_MSIX) {
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+ "config space!\n");
+ return -1;
+ }
+
+ /* we need second byte */
+ cap_offset = (reg & 0xFF00) >> 8;
+
+ continue;
+ }
+ /* else, read table offset */
+ else {
+ /* table offset resides in the next 4 bytes */
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset + 4);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
+ "space!\n");
+ return -1;
+ }
+
+ ret = pread64(fd, &flags, sizeof(flags),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset + 2);
+ if (ret != sizeof(flags)) {
+ RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
+ "space!\n");
+ return -1;
+ }
+
+ msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
+ msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
+ msix_table->size =
+ 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
+
+ return 0;
+ }
+ }
+ return 0;
+}
+
+/* set PCI bus mastering */
+static int
+pci_vfio_set_bus_master(int dev_fd, bool op)
+{
+ uint16_t reg;
+ int ret;
+
+ ret = pread64(dev_fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_COMMAND);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
+ return -1;
+ }
+
+ if (op)
+ /* set the master bit */
+ reg |= PCI_COMMAND_MASTER;
+ else
+ reg &= ~(PCI_COMMAND_MASTER);
+
+ ret = pwrite64(dev_fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_COMMAND);
+
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/* set up interrupt support (but not enable interrupts) */
+static int
+pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
+{
+ int i, ret, intr_idx;
+ enum rte_intr_mode intr_mode;
+
+ /* default to invalid index */
+ intr_idx = VFIO_PCI_NUM_IRQS;
+
+ /* Get default / configured intr_mode */
+ intr_mode = rte_eal_vfio_intr_mode();
+
+ /* get interrupt type from internal config (MSI-X by default, can be
+ * overridden from the command line
+ */
+ switch (intr_mode) {
+ case RTE_INTR_MODE_MSIX:
+ intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
+ break;
+ case RTE_INTR_MODE_MSI:
+ intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
+ break;
+ case RTE_INTR_MODE_LEGACY:
+ intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
+ break;
+ /* don't do anything if we want to automatically determine interrupt type */
+ case RTE_INTR_MODE_NONE:
+ break;
+ default:
+ RTE_LOG(ERR, EAL, " unknown default interrupt type!\n");
+ return -1;
+ }
+
+ /* start from MSI-X interrupt type */
+ for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
+ struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+ int fd = -1;
+
+ /* skip interrupt modes we don't want */
+ if (intr_mode != RTE_INTR_MODE_NONE &&
+ i != intr_idx)
+ continue;
+
+ irq.index = i;
+
+ ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " cannot get IRQ info, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* if this vector cannot be used with eventfd, fail if we explicitly
+ * specified interrupt type, otherwise continue */
+ if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
+ if (intr_mode != RTE_INTR_MODE_NONE) {
+ RTE_LOG(ERR, EAL,
+ " interrupt vector does not support eventfd!\n");
+ return -1;
+ } else
+ continue;
+ }
+
+ /* set up an eventfd for interrupts */
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, " cannot set up eventfd, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ dev->intr_handle.fd = fd;
+ dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+
+ switch (i) {
+ case VFIO_PCI_MSIX_IRQ_INDEX:
+ intr_mode = RTE_INTR_MODE_MSIX;
+ dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+ break;
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ intr_mode = RTE_INTR_MODE_MSI;
+ dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
+ break;
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ intr_mode = RTE_INTR_MODE_LEGACY;
+ dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
+ break;
+ default:
+ RTE_LOG(ERR, EAL, " unknown interrupt type!\n");
+ return -1;
+ }
+
+ return 0;
+ }
+
+ /* if we're here, we haven't found a suitable interrupt vector */
+ return -1;
+}
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+/*
+ * Spinlock for device hot-unplug failure handling.
+ * If it tries to access bus or device, such as handle sigbus on bus
+ * or handle memory failure for device, just need to use this lock.
+ * It could protect the bus and the device to avoid race condition.
+ */
+static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
+
+static void
+pci_vfio_req_handler(void *param)
+{
+ struct rte_bus *bus;
+ int ret;
+ struct rte_device *device = (struct rte_device *)param;
+
+ rte_spinlock_lock(&failure_handle_lock);
+ bus = rte_bus_find_by_device(device);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n",
+ device->name);
+ goto handle_end;
+ }
+
+ /*
+ * vfio kernel module request user space to release allocated
+ * resources before device be deleted in kernel, so it can directly
+ * call the vfio bus hot-unplug handler to process it.
+ */
+ ret = bus->hot_unplug_handler(device);
+ if (ret)
+ RTE_LOG(ERR, EAL,
+ "Can not handle hot-unplug for device (%s)\n",
+ device->name);
+handle_end:
+ rte_spinlock_unlock(&failure_handle_lock);
+}
+
+/* enable notifier (only enable req now) */
+static int
+pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd)
+{
+ int ret;
+ int fd = -1;
+
+ /* set up an eventfd for req notifier */
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ dev->vfio_req_intr_handle.fd = fd;
+ dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_VFIO_REQ;
+ dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;
+
+ ret = rte_intr_callback_register(&dev->vfio_req_intr_handle,
+ pci_vfio_req_handler,
+ (void *)&dev->device);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Fail to register req notifier handler.\n");
+ goto error;
+ }
+
+ ret = rte_intr_enable(&dev->vfio_req_intr_handle);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Fail to enable req notifier.\n");
+ ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle,
+ pci_vfio_req_handler,
+ (void *)&dev->device);
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "Fail to unregister req notifier handler.\n");
+ goto error;
+ }
+
+ return 0;
+error:
+ close(fd);
+
+ dev->vfio_req_intr_handle.fd = -1;
+ dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+ dev->vfio_req_intr_handle.vfio_dev_fd = -1;
+
+ return -1;
+}
+
+/* disable notifier (only disable req now) */
+static int
+pci_vfio_disable_notifier(struct rte_pci_device *dev)
+{
+ int ret;
+
+ ret = rte_intr_disable(&dev->vfio_req_intr_handle);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "fail to disable req notifier.\n");
+ return -1;
+ }
+
+ ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle,
+ pci_vfio_req_handler,
+ (void *)&dev->device);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL,
+ "fail to unregister req notifier handler.\n");
+ return -1;
+ }
+
+ close(dev->vfio_req_intr_handle.fd);
+
+ dev->vfio_req_intr_handle.fd = -1;
+ dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+ dev->vfio_req_intr_handle.vfio_dev_fd = -1;
+
+ return 0;
+}
+#endif
+
+static int
+pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
+{
+ uint32_t ioport_bar;
+ int ret;
+
+ ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+ + PCI_BASE_ADDRESS_0 + bar_index*4);
+ if (ret != sizeof(ioport_bar)) {
+ RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n",
+ PCI_BASE_ADDRESS_0 + bar_index*4);
+ return -1;
+ }
+
+ return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
+}
+
+static int
+pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
+{
+ if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
+ RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
+ return -1;
+ }
+
+ /* set bus mastering for the device */
+ if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
+ RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
+ return -1;
+ }
+
+ /*
+ * Reset the device. If the device is not capable of resetting,
+ * then it updates errno as EINVAL.
+ */
+ if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
+ RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
+ int bar_index, int additional_flags)
+{
+ struct memreg {
+ uint64_t offset;
+ size_t size;
+ } memreg[2] = {};
+ void *bar_addr;
+ struct pci_msix_table *msix_table = &vfio_res->msix_table;
+ struct pci_map *bar = &vfio_res->maps[bar_index];
+
+ if (bar->size == 0) {
+ RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
+ return 0;
+ }
+
+ if (msix_table->bar_index == bar_index) {
+ /*
+ * VFIO will not let us map the MSI-X table,
+ * but we can map around it.
+ */
+ uint32_t table_start = msix_table->offset;
+ uint32_t table_end = table_start + msix_table->size;
+ table_end = RTE_ALIGN(table_end, PAGE_SIZE);
+ table_start = RTE_ALIGN_FLOOR(table_start, PAGE_SIZE);
+
+ /* If page-aligned start of MSI-X table is less than the
+ * actual MSI-X table start address, reassign to the actual
+ * start address.
+ */
+ if (table_start < msix_table->offset)
+ table_start = msix_table->offset;
+
+ if (table_start == 0 && table_end >= bar->size) {
+ /* Cannot map this BAR */
+ RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
+ bar->size = 0;
+ bar->addr = 0;
+ return 0;
+ }
+
+ memreg[0].offset = bar->offset;
+ memreg[0].size = table_start;
+ if (bar->size < table_end) {
+ /*
+ * If MSI-X table end is beyond BAR end, don't attempt
+ * to perform second mapping.
+ */
+ memreg[1].offset = 0;
+ memreg[1].size = 0;
+ } else {
+ memreg[1].offset = bar->offset + table_end;
+ memreg[1].size = bar->size - table_end;
+ }
+
+ RTE_LOG(DEBUG, EAL,
+ "Trying to map BAR%d that contains the MSI-X "
+ "table. Trying offsets: "
+ "0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx\n",
+ bar_index,
+ memreg[0].offset, memreg[0].size,
+ memreg[1].offset, memreg[1].size);
+ } else {
+ memreg[0].offset = bar->offset;
+ memreg[0].size = bar->size;
+ }
+
+ /* reserve the address using an inaccessible mapping */
+ bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
+ MAP_ANONYMOUS | additional_flags, -1, 0);
+ if (bar_addr != MAP_FAILED) {
+ void *map_addr = NULL;
+ if (memreg[0].size) {
+ /* actual map of first part */
+ map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
+ memreg[0].offset,
+ memreg[0].size,
+ MAP_FIXED);
+ }
+
+ /* if there's a second part, try to map it */
+ if (map_addr != MAP_FAILED
+ && memreg[1].offset && memreg[1].size) {
+ void *second_addr = RTE_PTR_ADD(bar_addr,
+ (uintptr_t)(memreg[1].offset -
+ bar->offset));
+ map_addr = pci_map_resource(second_addr,
+ vfio_dev_fd,
+ memreg[1].offset,
+ memreg[1].size,
+ MAP_FIXED);
+ }
+
+ if (map_addr == MAP_FAILED || !map_addr) {
+ munmap(bar_addr, bar->size);
+ bar_addr = MAP_FAILED;
+ RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
+ bar_index);
+ return -1;
+ }
+ } else {
+ RTE_LOG(ERR, EAL,
+ "Failed to create inaccessible mapping for BAR%d\n",
+ bar_index);
+ return -1;
+ }
+
+ bar->addr = bar_addr;
+ return 0;
+}
+
+/*
+ * region info may contain capability headers, so we need to keep reallocating
+ * the memory until we match allocated memory size with argsz.
+ */
+static int
+pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
+ int region)
+{
+ struct vfio_region_info *ri;
+ size_t argsz = sizeof(*ri);
+ int ret;
+
+ ri = malloc(sizeof(*ri));
+ if (ri == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
+ return -1;
+ }
+again:
+ memset(ri, 0, argsz);
+ ri->argsz = argsz;
+ ri->index = region;
+
+ ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
+ if (ret < 0) {
+ free(ri);
+ return ret;
+ }
+ if (ri->argsz != argsz) {
+ struct vfio_region_info *tmp;
+
+ argsz = ri->argsz;
+ tmp = realloc(ri, argsz);
+
+ if (tmp == NULL) {
+ /* realloc failed but the ri is still there */
+ free(ri);
+ RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
+ return -1;
+ }
+ ri = tmp;
+ goto again;
+ }
+ *info = ri;
+
+ return 0;
+}
+
+static struct vfio_info_cap_header *
+pci_vfio_info_cap(struct vfio_region_info *info, int cap)
+{
+ struct vfio_info_cap_header *h;
+ size_t offset;
+
+ if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
+ /* VFIO info does not advertise capabilities */
+ return NULL;
+ }
+
+ offset = VFIO_CAP_OFFSET(info);
+ while (offset != 0) {
+ h = RTE_PTR_ADD(info, offset);
+ if (h->id == cap)
+ return h;
+ offset = h->next;
+ }
+ return NULL;
+}
+
+static int
+pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
+{
+ struct vfio_region_info *info;
+ int ret;
+
+ ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
+ if (ret < 0)
+ return -1;
+
+ ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
+
+ /* cleanup */
+ free(info);
+
+ return ret;
+}
+
+
+static int
+pci_vfio_map_resource_primary(struct rte_pci_device *dev)
+{
+ struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+ char pci_addr[PATH_MAX] = {0};
+ int vfio_dev_fd;
+ struct rte_pci_addr *loc = &dev->addr;
+ int i, ret;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list =
+ RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+
+ struct pci_map *maps;
+
+ dev->intr_handle.fd = -1;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ dev->vfio_req_intr_handle.fd = -1;
+#endif
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+ ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ &vfio_dev_fd, &device_info);
+ if (ret)
+ return ret;
+
+ /* allocate vfio_res and get region info */
+ vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot store vfio mmap details\n", __func__);
+ goto err_vfio_dev_fd;
+ }
+ memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
+
+ /* get number of registers (up to BAR5) */
+ vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
+ VFIO_PCI_BAR5_REGION_INDEX + 1);
+
+ /* map BARs */
+ maps = vfio_res->maps;
+
+ vfio_res->msix_table.bar_index = -1;
+ /* get MSI-X BAR, if any (we have to know where it is because we can't
+ * easily mmap it when using VFIO)
+ */
+ ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n",
+ pci_addr);
+ goto err_vfio_res;
+ }
+ /* if we found our MSI-X BAR region, check if we can mmap it */
+ if (vfio_res->msix_table.bar_index != -1) {
+ int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
+ vfio_res->msix_table.bar_index);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
+ goto err_vfio_res;
+ } else if (ret != 0) {
+ /* we can map it, so we don't care where it is */
+ RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
+ vfio_res->msix_table.bar_index = -1;
+ }
+ }
+
+ for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+ struct vfio_region_info *reg = NULL;
+ void *bar_addr;
+
+ ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " %s cannot get device region info "
+ "error %i (%s)\n", pci_addr, errno,
+ strerror(errno));
+ goto err_vfio_res;
+ }
+
+ /* chk for io port region */
+ ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
+ if (ret < 0) {
+ free(reg);
+ goto err_vfio_res;
+ } else if (ret) {
+ RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
+ i);
+ free(reg);
+ continue;
+ }
+
+ /* skip non-mmapable BARs */
+ if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+ free(reg);
+ continue;
+ }
+
+ /* try mapping somewhere close to the end of hugepages */
+ if (pci_map_addr == NULL)
+ pci_map_addr = pci_find_max_end_va();
+
+ bar_addr = pci_map_addr;
+ pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
+
+ pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
+ sysconf(_SC_PAGE_SIZE));
+
+ maps[i].addr = bar_addr;
+ maps[i].offset = reg->offset;
+ maps[i].size = reg->size;
+ maps[i].path = NULL; /* vfio doesn't have per-resource paths */
+
+ ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
+ pci_addr, i, strerror(errno));
+ free(reg);
+ goto err_vfio_res;
+ }
+
+ dev->mem_resource[i].addr = maps[i].addr;
+
+ free(reg);
+ }
+
+ if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
+ RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr);
+ goto err_vfio_res;
+ }
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
+ RTE_LOG(ERR, EAL, "Error setting up notifier!\n");
+ goto err_vfio_res;
+ }
+
+#endif
+ TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);
+
+ return 0;
+err_vfio_res:
+ rte_free(vfio_res);
+err_vfio_dev_fd:
+ close(vfio_dev_fd);
+ return -1;
+}
+
+static int
+pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
+{
+ struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+ char pci_addr[PATH_MAX] = {0};
+ int vfio_dev_fd;
+ struct rte_pci_addr *loc = &dev->addr;
+ int i, ret;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list =
+ RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+
+ struct pci_map *maps;
+
+ dev->intr_handle.fd = -1;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ dev->vfio_req_intr_handle.fd = -1;
+#endif
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+ /* if we're in a secondary process, just find our tailq entry */
+ TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+ if (rte_pci_addr_cmp(&vfio_res->pci_addr,
+ &dev->addr))
+ continue;
+ break;
+ }
+ /* if we haven't found our tailq entry, something's wrong */
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ &vfio_dev_fd, &device_info);
+ if (ret)
+ return ret;
+
+ /* map BARs */
+ maps = vfio_res->maps;
+
+ for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+ ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
+ pci_addr, i, strerror(errno));
+ goto err_vfio_dev_fd;
+ }
+
+ dev->mem_resource[i].addr = maps[i].addr;
+ }
+
+ /* we need save vfio_dev_fd, so it can be used during release */
+ dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;
+#endif
+
+ return 0;
+err_vfio_dev_fd:
+ close(vfio_dev_fd);
+ return -1;
+}
+
+/*
+ * map the PCI resources of a PCI device in virtual memory (VFIO version).
+ * primary and secondary processes follow almost exactly the same path
+ */
+int
+pci_vfio_map_resource(struct rte_pci_device *dev)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return pci_vfio_map_resource_primary(dev);
+ else
+ return pci_vfio_map_resource_secondary(dev);
+}
+
+static struct mapped_pci_resource *
+find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
+ struct rte_pci_device *dev,
+ const char *pci_addr)
+{
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct pci_map *maps;
+ int i;
+
+ /* Get vfio_res */
+ TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+ if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
+ continue;
+ break;
+ }
+
+ if (vfio_res == NULL)
+ return vfio_res;
+
+ RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
+ pci_addr);
+
+ maps = vfio_res->maps;
+ for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+
+ /*
+ * We do not need to be aware of MSI-X table BAR mappings as
+ * when mapping. Just using current maps array is enough
+ */
+ if (maps[i].addr) {
+ RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
+ pci_addr, maps[i].addr);
+ pci_unmap_resource(maps[i].addr, maps[i].size);
+ }
+ }
+
+ return vfio_res;
+}
+
+static int
+pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
+{
+ char pci_addr[PATH_MAX] = {0};
+ struct rte_pci_addr *loc = &dev->addr;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list;
+ int ret;
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ ret = pci_vfio_disable_notifier(dev);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "fail to disable req notifier.\n");
+ return -1;
+ }
+
+#endif
+ if (close(dev->intr_handle.fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
+ pci_addr);
+ return -1;
+ }
+
+ if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
+ RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
+ dev->intr_handle.vfio_dev_fd);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot release device\n", __func__);
+ return ret;
+ }
+
+ vfio_res_list =
+ RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+ vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
+
+ /* if we haven't found our tailq entry, something's wrong */
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ TAILQ_REMOVE(vfio_res_list, vfio_res, next);
+
+ return 0;
+}
+
+static int
+pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
+{
+ char pci_addr[PATH_MAX] = {0};
+ struct rte_pci_addr *loc = &dev->addr;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list;
+ int ret;
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+ ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
+ dev->intr_handle.vfio_dev_fd);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot release device\n", __func__);
+ return ret;
+ }
+
+ vfio_res_list =
+ RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+ vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
+
+ /* if we haven't found our tailq entry, something's wrong */
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+pci_vfio_unmap_resource(struct rte_pci_device *dev)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return pci_vfio_unmap_resource_primary(dev);
+ else
+ return pci_vfio_unmap_resource_secondary(dev);
+}
+
+int
+pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
+ bar > VFIO_PCI_BAR5_REGION_INDEX) {
+ RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
+ return -1;
+ }
+
+ p->dev = dev;
+ p->base = VFIO_GET_REGION_ADDR(bar);
+ return 0;
+}
+
+void
+pci_vfio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
+
+ if (pread64(intr_handle->vfio_dev_fd, data,
+ len, p->base + offset) <= 0)
+ RTE_LOG(ERR, EAL,
+ "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
+ VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+void
+pci_vfio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
+
+ if (pwrite64(intr_handle->vfio_dev_fd, data,
+ len, p->base + offset) <= 0)
+ RTE_LOG(ERR, EAL,
+ "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
+ VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+int
+pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
+{
+ RTE_SET_USED(p);
+ return -1;
+}
+
+int
+pci_vfio_is_enabled(void)
+{
+ return rte_vfio_is_enabled("vfio_pci");
+}
+#endif