summaryrefslogtreecommitdiffstats
path: root/src/seastar/dpdk/drivers/bus/vmbus
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/seastar/dpdk/drivers/bus/vmbus
parentInitial commit. (diff)
downloadceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/seastar/dpdk/drivers/bus/vmbus')
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/Makefile35
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/linux/Makefile3
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_bus.c358
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_uio.c454
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/meson.build20
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/private.h141
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus.h421
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus_version.map36
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/rte_vmbus_reg.h344
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/vmbus_bufring.c244
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/vmbus_channel.c445
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/vmbus_common.c307
-rw-r--r--src/seastar/dpdk/drivers/bus/vmbus/vmbus_common_uio.c234
13 files changed, 3042 insertions, 0 deletions
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/Makefile b/src/seastar/dpdk/drivers/bus/vmbus/Makefile
new file mode 100644
index 000000000..8f3ec7af4
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/Makefile
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# Library built from this directory, its ABI version and symbol export map
+LIB = librte_bus_vmbus.a
+LIBABIVER := 2
+EXPORT_MAP := rte_bus_vmbus_version.map
+
+CFLAGS += -I$(SRCDIR)
+CFLAGS += -O3 $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
+# Pick the OS-specific sub-directory. Only Linux sets SYSTEM; a FreeBSD
+# configuration stops the build with an error.
+ifneq ($(CONFIG_RTE_EXEC_ENV_LINUX),)
+SYSTEM := linux
+endif
+ifneq ($(CONFIG_RTE_EXEC_ENV_FREEBSD),)
+$(error "VMBUS not implemented for BSD yet")
+endif
+
+# Header search path for the OS-specific directory and for the EAL common
+# directory
+CFLAGS += -I$(RTE_SDK)/drivers/bus/vmbus/$(SYSTEM)
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
+
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev
+
+# The per-OS Makefile appends its files to SRCS; they are prefixed with the
+# OS directory here, then the OS-independent sources are added.
+include $(RTE_SDK)/drivers/bus/vmbus/$(SYSTEM)/Makefile
+SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) := $(addprefix $(SYSTEM)/,$(SRCS))
+SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) += vmbus_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) += vmbus_channel.c vmbus_bufring.c
+SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) += vmbus_common_uio.c
+
+# Headers listed for the build system's include symlinks
+# (presumably the public headers of this library - confirm in mk/)
+SYMLINK-$(CONFIG_RTE_LIBRTE_VMBUS)-include += rte_bus_vmbus.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_VMBUS)-include += rte_vmbus_reg.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/linux/Makefile b/src/seastar/dpdk/drivers/bus/vmbus/linux/Makefile
new file mode 100644
index 000000000..ef0d30b2d
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/linux/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+# Linux-specific sources; the parent Makefile includes this file and
+# prefixes each entry with the OS directory.
+SRCS += vmbus_bus.c vmbus_uio.c
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_bus.c b/src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_bus.c
new file mode 100644
index 000000000..a4755a387
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_bus.c
@@ -0,0 +1,358 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <rte_eal.h>
+#include <rte_uuid.h>
+#include <rte_tailq.h>
+#include <rte_log.h>
+#include <rte_devargs.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_bus_vmbus.h>
+
+#include "eal_filesystem.h"
+#include "private.h"
+
+/** Pathname of VMBUS devices directory. */
+#define SYSFS_VMBUS_DEVICES "/sys/bus/vmbus/devices"
+
+extern struct rte_vmbus_bus rte_vmbus_bus;
+
+/*
+ * Parse the UUID stored in a sysfs attribute file.
+ *
+ * Only the first line of the file is used. A value written in "{...}"
+ * notation is reduced to the text between the braces before it is handed
+ * to rte_uuid_parse().
+ *
+ * Returns 0 and fills uu on success, -1 if the file cannot be opened or
+ * read or if its content is not a valid UUID.
+ */
+static int
+parse_sysfs_uuid(const char *filename, rte_uuid_t uu)
+{
+	char line[BUFSIZ];
+	char *text = line;
+	char *mark;
+	char *got;
+	FILE *fp = fopen(filename, "r");
+
+	if (fp == NULL) {
+		VMBUS_LOG(ERR, "cannot open sysfs value %s: %s",
+			  filename, strerror(errno));
+		return -1;
+	}
+
+	got = fgets(line, sizeof(line), fp);
+	fclose(fp);
+	if (got == NULL) {
+		VMBUS_LOG(ERR, "cannot read sysfs value %s",
+			  filename);
+		return -1;
+	}
+
+	/* cut the line at the first newline */
+	mark = strchr(line, '\n');
+	if (mark != NULL)
+		*mark = '\0';
+
+	/* strip { } notation */
+	if (line[0] == '{') {
+		text = &line[1];
+		mark = strchr(text, '}');
+		if (mark != NULL)
+			*mark = '\0';
+	}
+
+	if (rte_uuid_parse(text, uu) >= 0)
+		return 0;
+
+	VMBUS_LOG(ERR, "%s %s not a valid UUID",
+		  filename, line);
+	return -1;
+}
+
+/*
+ * Read the first line of a sysfs attribute file into buf.
+ *
+ * At most buflen - 1 characters are stored and the result is always
+ * NUL-terminated; a trailing newline is removed.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or read.
+ */
+static int
+get_sysfs_string(const char *filename, char *buf, size_t buflen)
+{
+	char *cp;
+	FILE *f;
+
+	f = fopen(filename, "r");
+	if (f == NULL) {
+		VMBUS_LOG(ERR, "cannot open sysfs value %s:%s",
+			  filename, strerror(errno));
+		return -1;
+	}
+
+	if (fgets(buf, buflen, f) == NULL) {
+		VMBUS_LOG(ERR, "cannot read sysfs value %s",
+			  filename);
+		fclose(f);
+		return -1;
+	}
+	fclose(f);
+
+	/*
+	 * Remove the trailing newline. fgets() terminated the string, so
+	 * search only the string itself: scanning all buflen bytes would
+	 * read bytes after the terminator that were never written.
+	 */
+	cp = strchr(buf, '\n');
+	if (cp)
+		*cp = '\0';
+
+	return 0;
+}
+
+/*
+ * Locate the UIO device bound to a vmbus device.
+ *
+ * Looks in <sysfs device dir>/uio for the first entry named "uio<number>".
+ * On a match the sysfs path of that entry is written to dstbuf and the
+ * number is returned; -1 is returned when the directory does not exist
+ * or holds no such entry.
+ */
+static int
+vmbus_get_uio_dev(const struct rte_vmbus_device *dev,
+		  char *dstbuf, size_t buflen)
+{
+	const size_t prefix_len = strlen("uio");
+	char uio_dir[PATH_MAX];
+	struct dirent *entry;
+	int result = -1;
+	DIR *dp;
+
+	/* Assume recent kernel where uio is in uio/uioX */
+	snprintf(uio_dir, sizeof(uio_dir),
+		 SYSFS_VMBUS_DEVICES "/%s/uio", dev->device.name);
+
+	dp = opendir(uio_dir);
+	if (dp == NULL)
+		return -1; /* Not a UIO device */
+
+	for (entry = readdir(dp); entry != NULL; entry = readdir(dp)) {
+		const char *digits;
+		unsigned int num;
+		char *stop;
+
+		if (strncmp(entry->d_name, "uio", prefix_len) != 0)
+			continue;
+
+		/* the prefix must be followed by at least one digit */
+		digits = entry->d_name + prefix_len;
+		errno = 0;
+		num = strtoull(digits, &stop, 10);
+		if (errno != 0 || stop == digits)
+			continue;
+
+		snprintf(dstbuf, buflen, "%s/uio%u", uio_dir, num);
+		result = num;
+		break;
+	}
+	closedir(dp);
+
+	return result;
+}
+
+/*
+ * Check map names with kernel names.
+ *
+ * Expected name of each UIO map, indexed by enum hv_uio_map.
+ * rte_vmbus_map_device() compares only the first strlen(map_names[i])
+ * characters of the maps/mapN/name attribute, so "recv:" and "send:" match
+ * names that carry a value after the colon (parsed there as the GPAD).
+ */
+static const char *map_names[VMBUS_MAX_RESOURCE] = {
+ [HV_TXRX_RING_MAP] = "txrx_rings",
+ [HV_INT_PAGE_MAP] = "int_page",
+ [HV_MON_PAGE_MAP] = "monitor_page",
+ [HV_RECV_BUF_MAP] = "recv:",
+ [HV_SEND_BUF_MAP] = "send:",
+};
+
+
+/*
+ * Map the resources of a vmbus device in virtual memory.
+ *
+ * For each of the VMBUS_MAX_RESOURCE UIO maps the name and size are read
+ * from sysfs. The name must start with the expected entry of map_names[];
+ * the size is stored in dev->resource[i].len and the number following ':'
+ * in the name (0 when there is none) in dev->resource[i].phys_addr.
+ * The actual mapping is then done by vmbus_uio_map_resource().
+ *
+ * Returns 1 when the device is not bound to a UIO driver, -1 when a sysfs
+ * attribute is missing or unexpected, otherwise the result of
+ * vmbus_uio_map_resource().
+ */
+int
+rte_vmbus_map_device(struct rte_vmbus_device *dev)
+{
+	char uioname[PATH_MAX], filename[PATH_MAX];
+	char dirname[PATH_MAX], mapname[64];
+	int i;
+
+	dev->uio_num = vmbus_get_uio_dev(dev, uioname, sizeof(uioname));
+	if (dev->uio_num < 0) {
+		VMBUS_LOG(DEBUG, "Not managed by UIO driver, skipped");
+		return 1;
+	}
+
+	/* Extract resource value */
+	for (i = 0; i < VMBUS_MAX_RESOURCE; i++) {
+		struct rte_mem_resource *res = &dev->resource[i];
+		unsigned long len, gpad = 0;
+		char *cp;
+
+		snprintf(dirname, sizeof(dirname),
+			 "%s/maps/map%d", uioname, i);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/name", dirname);
+
+		if (get_sysfs_string(filename, mapname, sizeof(mapname)) < 0) {
+			VMBUS_LOG(ERR, "could not read %s", filename);
+			return -1;
+		}
+
+		if (strncmp(map_names[i], mapname, strlen(map_names[i])) != 0) {
+			VMBUS_LOG(ERR,
+				  "unexpected resource %s (expected %s)",
+				  mapname, map_names[i]);
+			return -1;
+		}
+
+		snprintf(filename, sizeof(filename),
+			 "%s/size", dirname);
+		if (eal_parse_sysfs_value(filename, &len) < 0) {
+			VMBUS_LOG(ERR,
+				  "could not read %s", filename);
+			return -1;
+		}
+		res->len = len;
+
+		/*
+		 * Both send and receive buffers have gpad in name.
+		 * Search the string only: the bytes of mapname after its
+		 * terminator are uninitialized and must not be inspected.
+		 */
+		cp = strchr(mapname, ':');
+		if (cp)
+			gpad = strtoul(cp + 1, NULL, 0);
+
+		/* put the GPAD value in physical address */
+		res->phys_addr = gpad;
+	}
+
+	return vmbus_uio_map_resource(dev);
+}
+
+/* Undo the device mappings by delegating to vmbus_uio_unmap_resource(). */
+void
+rte_vmbus_unmap_device(struct rte_vmbus_device *dev)
+{
+ vmbus_uio_unmap_resource(dev);
+}
+
+/*
+ * Scan one vmbus sysfs entry, and fill the devices list from it.
+ *
+ * Reads device_id, class_id, id (relid), monitor_id and, when present,
+ * numa_node from the sysfs directory of the device and inserts the new
+ * device in rte_vmbus_bus.device_list, which is kept ordered by device_id.
+ * A device whose device_id is already in the list is dropped.
+ *
+ * Returns 0 when the device was added or was already registered, -1 on
+ * allocation failure or when a mandatory attribute cannot be read.
+ */
+static int
+vmbus_scan_one(const char *name)
+{
+	struct rte_vmbus_device *dev, *dev2;
+	char filename[PATH_MAX];
+	char dirname[PATH_MAX];
+	unsigned long tmp;
+	char *devname;
+
+	dev = calloc(1, sizeof(*dev));
+	if (dev == NULL)
+		return -1;
+
+	/*
+	 * Keep our own pointer to the duplicated name so that it can be
+	 * released on every path that discards the device.
+	 */
+	devname = strdup(name);
+	dev->device.bus = &rte_vmbus_bus.bus;
+	dev->device.name = devname;
+	if (!devname)
+		goto error;
+
+	/* sysfs base directory
+	 *   /sys/bus/vmbus/devices/7a08391f-f5a0-4ac0-9802-d13fd964f8df
+	 * or on older kernel
+	 *   /sys/bus/vmbus/devices/vmbus_1
+	 */
+	snprintf(dirname, sizeof(dirname), "%s/%s",
+		 SYSFS_VMBUS_DEVICES, name);
+
+	/* get device id */
+	snprintf(filename, sizeof(filename), "%s/device_id", dirname);
+	if (parse_sysfs_uuid(filename, dev->device_id) < 0)
+		goto error;
+
+	/* get device class */
+	snprintf(filename, sizeof(filename), "%s/class_id", dirname);
+	if (parse_sysfs_uuid(filename, dev->class_id) < 0)
+		goto error;
+
+	/* get relid */
+	snprintf(filename, sizeof(filename), "%s/id", dirname);
+	if (eal_parse_sysfs_value(filename, &tmp) < 0)
+		goto error;
+	dev->relid = tmp;
+
+	/* get monitor id */
+	snprintf(filename, sizeof(filename), "%s/monitor_id", dirname);
+	if (eal_parse_sysfs_value(filename, &tmp) < 0)
+		goto error;
+	dev->monitor_id = tmp;
+
+	/* get numa node (if present) */
+	snprintf(filename, sizeof(filename), "%s/numa_node",
+		 dirname);
+
+	if (access(filename, R_OK) == 0) {
+		if (eal_parse_sysfs_value(filename, &tmp) < 0)
+			goto error;
+		dev->device.numa_node = tmp;
+	} else {
+		/* no numa_node attribute: leave the socket unspecified */
+		dev->device.numa_node = SOCKET_ID_ANY;
+	}
+
+	dev->device.devargs = vmbus_devargs_lookup(dev);
+
+	/* device is valid, add in list (sorted) */
+	VMBUS_LOG(DEBUG, "Adding vmbus device %s", name);
+
+	TAILQ_FOREACH(dev2, &rte_vmbus_bus.device_list, next) {
+		int ret;
+
+		ret = rte_uuid_compare(dev->device_id, dev2->device_id);
+		if (ret > 0)
+			continue;
+
+		if (ret < 0) {
+			vmbus_insert_device(dev2, dev);
+		} else { /* already registered */
+			VMBUS_LOG(NOTICE,
+				  "%s already registered", name);
+			free(devname);
+			free(dev);
+		}
+		return 0;
+	}
+
+	vmbus_add_device(dev);
+	return 0;
+error:
+	VMBUS_LOG(DEBUG, "failed");
+
+	free(devname);
+	free(dev);
+	return -1;
+}
+
+/*
+ * Scan the vmbus sysfs directory and register every device found in it.
+ *
+ * Entries whose name starts with '.' are ignored. A missing sysfs
+ * directory is not an error. Scanning stops at the first device that
+ * cannot be registered.
+ *
+ * Returns 0 on success (including when there is no vmbus directory),
+ * -1 when the directory cannot be opened for another reason or when a
+ * device scan fails.
+ */
+int
+rte_vmbus_scan(void)
+{
+	struct dirent *entry;
+	int status = 0;
+	DIR *bus_dir = opendir(SYSFS_VMBUS_DEVICES);
+
+	if (bus_dir == NULL) {
+		if (errno == ENOENT)
+			return 0;
+
+		VMBUS_LOG(ERR, "opendir %s failed: %s",
+			  SYSFS_VMBUS_DEVICES, strerror(errno));
+		return -1;
+	}
+
+	for (entry = readdir(bus_dir); entry != NULL;
+	     entry = readdir(bus_dir)) {
+		if (entry->d_name[0] == '.')
+			continue;
+
+		if (vmbus_scan_one(entry->d_name) < 0) {
+			status = -1;
+			break;
+		}
+	}
+
+	closedir(bus_dir);
+	return status;
+}
+
+/* Pass the control value 1 for this device to vmbus_uio_irq_control(). */
+void rte_vmbus_irq_mask(struct rte_vmbus_device *device)
+{
+ vmbus_uio_irq_control(device, 1);
+}
+
+/* Pass the control value 0 for this device to vmbus_uio_irq_control(). */
+void rte_vmbus_irq_unmask(struct rte_vmbus_device *device)
+{
+ vmbus_uio_irq_control(device, 0);
+}
+
+/* Return the result of vmbus_uio_irq_read() for this device. */
+int rte_vmbus_irq_read(struct rte_vmbus_device *device)
+{
+ return vmbus_uio_irq_read(device);
+}
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_uio.c b/src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_uio.c
new file mode 100644
index 000000000..be6b677f9
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/linux/vmbus_uio.c
@@ -0,0 +1,454 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include <rte_log.h>
+#include <rte_bus.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_bus_vmbus.h>
+#include <rte_string_fns.h>
+
+#include "private.h"
+
+/** Pathname of VMBUS devices directory. */
+#define SYSFS_VMBUS_DEVICES "/sys/bus/vmbus/devices"
+
+static void *vmbus_map_addr;
+
+/*
+ * Control interrupts: write the 32-bit onoff value to the interrupt file
+ * descriptor of the device. A failed write is logged, not reported.
+ */
+void vmbus_uio_irq_control(struct rte_vmbus_device *dev, int32_t onoff)
+{
+	ssize_t written;
+
+	written = write(dev->intr_handle.fd, &onoff, sizeof(onoff));
+	if (written >= 0)
+		return;
+
+	VMBUS_LOG(ERR, "cannot write to %d:%s",
+		  dev->intr_handle.fd, strerror(errno));
+}
+
+/*
+ * Read the 32-bit counter from the interrupt file descriptor of the device.
+ *
+ * Returns the value read, -errno when read() fails, or -EINVAL when fewer
+ * bytes than a full counter were returned.
+ */
+int vmbus_uio_irq_read(struct rte_vmbus_device *dev)
+{
+	int32_t count;
+	int cc;
+
+	cc = read(dev->intr_handle.fd, &count, sizeof(count));
+	if (cc < (int)sizeof(count)) {
+		if (cc < 0) {
+			/* logging may overwrite errno, so capture it first */
+			int err = errno;
+
+			VMBUS_LOG(ERR, "IRQ read failed %s",
+				  strerror(err));
+			return -err;
+		}
+		VMBUS_LOG(ERR, "can't read IRQ count");
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+/*
+ * Release what vmbus_uio_alloc_resource() set up: free the mapping record
+ * and close the file descriptors held in the interrupt handle of the
+ * device. Closed descriptors are reset to -1; closing the interrupt fd
+ * also resets the handle type to RTE_INTR_HANDLE_UNKNOWN.
+ */
+void
+vmbus_uio_free_resource(struct rte_vmbus_device *dev,
+			struct mapped_vmbus_resource *uio_res)
+{
+	struct rte_intr_handle *handle = &dev->intr_handle;
+
+	rte_free(uio_res);
+
+	if (handle->uio_cfg_fd >= 0) {
+		close(handle->uio_cfg_fd);
+		handle->uio_cfg_fd = -1;
+	}
+
+	if (handle->fd < 0)
+		return;
+
+	close(handle->fd);
+	handle->fd = -1;
+	handle->type = RTE_INTR_HANDLE_UNKNOWN;
+}
+
+/*
+ * Open the /dev/uioX node of a device and allocate its mapping record.
+ *
+ * On success the descriptor is stored in dev->intr_handle.fd, the handle
+ * type is set to RTE_INTR_HANDLE_UIO_INTX and *uio_res points to a zeroed
+ * record holding the device path and the device id.
+ *
+ * Returns 0 on success. On failure returns -1 after releasing everything
+ * through vmbus_uio_free_resource(); *uio_res must not be used then.
+ */
+int
+vmbus_uio_alloc_resource(struct rte_vmbus_device *dev,
+			 struct mapped_vmbus_resource **uio_res)
+{
+	char devname[PATH_MAX]; /* contains the /dev/uioX */
+
+	/*
+	 * The error path hands *uio_res to vmbus_uio_free_resource(), which
+	 * passes it to rte_free(). Clear it first so that a failure before
+	 * the allocation never frees whatever value the caller left there.
+	 */
+	*uio_res = NULL;
+
+	/* save fd if in primary process */
+	snprintf(devname, sizeof(devname), "/dev/uio%u", dev->uio_num);
+	dev->intr_handle.fd = open(devname, O_RDWR);
+	if (dev->intr_handle.fd < 0) {
+		VMBUS_LOG(ERR, "Cannot open %s: %s",
+			  devname, strerror(errno));
+		goto error;
+	}
+	dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX;
+
+	/* allocate the mapping details for secondary processes*/
+	*uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
+	if (*uio_res == NULL) {
+		VMBUS_LOG(ERR, "cannot store uio mmap details");
+		goto error;
+	}
+
+	strlcpy((*uio_res)->path, devname, PATH_MAX);
+	rte_uuid_copy((*uio_res)->id, dev->device_id);
+
+	return 0;
+
+error:
+	vmbus_uio_free_resource(dev, *uio_res);
+	return -1;
+}
+
+/*
+ * rte_memseg_list_walk() callback: raise *arg (a void *) to the end
+ * address of this memseg list when that end lies above the current value.
+ * Always returns 0.
+ */
+static int
+find_max_end_va(const struct rte_memseg_list *msl, void *arg)
+{
+	void **highest = arg;
+	size_t span = msl->memseg_arr.len * msl->page_sz;
+	void *list_end = RTE_PTR_ADD(msl->base_va, span);
+
+	if (list_end > *highest)
+		*highest = list_end;
+
+	return 0;
+}
+
+/*
+ * Return the highest end address over all memseg lists, or NULL when the
+ * walk visits none.
+ *
+ * TODO: this should be part of memseg api.
+ * code is duplicated from PCI.
+ */
+static void *
+vmbus_find_max_end_va(void)
+{
+	void *highest = NULL;
+
+	rte_memseg_list_walk(find_max_end_va, &highest);
+
+	return highest;
+}
+
+/*
+ * Map one UIO resource of a device.
+ *
+ * The resource is selected by passing idx * PAGE_SIZE as the mmap offset
+ * on the UIO device file named in uio_res->path. The address hint is the
+ * running vmbus_map_addr, which starts at the end of the memseg lists and
+ * is advanced past every successful mapping.
+ *
+ * On success the address is stored in dev->resource[idx].addr, address and
+ * size are recorded in uio_res->maps[idx] and 0 is returned; -1 is returned
+ * when the device file cannot be opened or the mapping fails.
+ */
+int
+vmbus_uio_map_resource_by_index(struct rte_vmbus_device *dev, int idx,
+				struct mapped_vmbus_resource *uio_res,
+				int flags)
+{
+	struct vmbus_map *record = &uio_res->maps[idx];
+	size_t length = dev->resource[idx].len;
+	off_t map_offset;
+	void *va;
+	int uio_fd;
+
+	/* devname for mmap */
+	uio_fd = open(uio_res->path, O_RDWR);
+	if (uio_fd < 0) {
+		VMBUS_LOG(ERR, "Cannot open %s: %s",
+			  uio_res->path, strerror(errno));
+		return -1;
+	}
+
+	/* try mapping somewhere close to the end of hugepages */
+	if (vmbus_map_addr == NULL)
+		vmbus_map_addr = vmbus_find_max_end_va();
+
+	/* offset is special in uio it indicates which resource */
+	map_offset = idx * PAGE_SIZE;
+
+	va = vmbus_map_resource(vmbus_map_addr, uio_fd, map_offset,
+				length, flags);
+	close(uio_fd);
+
+	if (va == MAP_FAILED)
+		return -1;
+
+	/* the next mapping is requested right behind this one */
+	vmbus_map_addr = RTE_PTR_ADD(va, length);
+	dev->resource[idx].addr = va;
+
+	/* Record result of successful mapping for use by secondary */
+	record->addr = va;
+	record->size = length;
+
+	return 0;
+}
+
+/*
+ * Look up the ring buffer of the primary channel in the mapping record of
+ * its device.
+ *
+ * *ring_buf receives the address of the HV_TXRX_RING_MAP mapping and
+ * *ring_size half of its size.
+ *
+ * Returns 0 on success, -ENOMEM when the device has no mapping record and
+ * -EINVAL when fewer than VMBUS_MAX_RESOURCE maps were recorded.
+ */
+static int vmbus_uio_map_primary(struct vmbus_channel *chan,
+				 void **ring_buf, uint32_t *ring_size)
+{
+	const struct vmbus_map *ring;
+	struct mapped_vmbus_resource *res;
+
+	res = vmbus_uio_find_resource(chan->device);
+	if (res == NULL) {
+		VMBUS_LOG(ERR, "can not find resources!");
+		return -ENOMEM;
+	}
+
+	if (res->nb_maps < VMBUS_MAX_RESOURCE) {
+		VMBUS_LOG(ERR, "VMBUS: only %u resources found!",
+			  res->nb_maps);
+		return -EINVAL;
+	}
+
+	ring = &res->maps[HV_TXRX_RING_MAP];
+	*ring_buf = ring->addr;
+	*ring_size = ring->size / 2;
+
+	return 0;
+}
+
+/*
+ * Map the ring buffer of a subchannel from its sysfs "ring" file.
+ *
+ * The whole file is mapped at the running vmbus_map_addr hint. *ring_buf
+ * receives the mapping address and *ring_size half of the file size.
+ *
+ * Returns 0 on success, -errno when the file cannot be opened or stat'ed,
+ * -EINVAL when its size is zero or not a multiple of PAGE_SIZE, and -EIO
+ * when the mapping fails.
+ */
+static int vmbus_uio_map_subchan(const struct rte_vmbus_device *dev,
+				 const struct vmbus_channel *chan,
+				 void **ring_buf, uint32_t *ring_size)
+{
+	char ring_path[PATH_MAX];
+	size_t file_size;
+	struct stat sb;
+	void *mapaddr;
+	int fd;
+	int err;
+
+	snprintf(ring_path, sizeof(ring_path),
+		 "%s/%s/channels/%u/ring",
+		 SYSFS_VMBUS_DEVICES, dev->device.name,
+		 chan->relid);
+
+	fd = open(ring_path, O_RDWR);
+	if (fd < 0) {
+		/* logging may overwrite errno, so capture it first */
+		err = errno;
+		VMBUS_LOG(ERR, "Cannot open %s: %s",
+			  ring_path, strerror(err));
+		return -err;
+	}
+
+	if (fstat(fd, &sb) < 0) {
+		/* neither logging nor close() may change the reported error */
+		err = errno;
+		VMBUS_LOG(ERR, "Cannot state %s: %s",
+			  ring_path, strerror(err));
+		close(fd);
+		return -err;
+	}
+	file_size = sb.st_size;
+
+	if (file_size == 0 || (file_size & (PAGE_SIZE - 1))) {
+		VMBUS_LOG(ERR, "incorrect size %s: %zu",
+			  ring_path, file_size);
+
+		close(fd);
+		return -EINVAL;
+	}
+
+	mapaddr = vmbus_map_resource(vmbus_map_addr, fd,
+				     0, file_size, 0);
+	close(fd);
+
+	if (mapaddr == MAP_FAILED)
+		return -EIO;
+
+	*ring_size = file_size / 2;
+	*ring_buf = mapaddr;
+
+	/*
+	 * Advance the hint past the new mapping. This must be based on the
+	 * mapped address, not on ring_buf, which is the address of the
+	 * caller's output pointer.
+	 */
+	vmbus_map_addr = RTE_PTR_ADD(mapaddr, file_size);
+	return 0;
+}
+
+/*
+ * Map the ring buffer of a subchannel at the address already recorded in
+ * the channel (chan->txbr.vbr).
+ *
+ * The sysfs "ring" file of the channel is mapped with a length of twice
+ * (txbr.dsize + sizeof(struct vmbus_bufring)). The mapping is only
+ * accepted when it lands exactly on the recorded address; a mapping at
+ * any other address is removed again.
+ *
+ * Returns 0 on success, -errno when the ring file cannot be opened and -1
+ * when the mapping fails or ends up at a different address.
+ */
+int
+vmbus_uio_map_secondary_subchan(const struct rte_vmbus_device *dev,
+				const struct vmbus_channel *chan)
+{
+	const struct vmbus_br *br = &chan->txbr;
+	char ring_path[PATH_MAX];
+	void *mapaddr, *ring_buf;
+	uint32_t ring_size;
+	int fd;
+
+	snprintf(ring_path, sizeof(ring_path),
+		 "%s/%s/channels/%u/ring",
+		 SYSFS_VMBUS_DEVICES, dev->device.name,
+		 chan->relid);
+
+	ring_buf = br->vbr;
+	ring_size = br->dsize + sizeof(struct vmbus_bufring);
+	VMBUS_LOG(INFO, "secondary ring_buf %p size %u",
+		  ring_buf, ring_size);
+
+	fd = open(ring_path, O_RDWR);
+	if (fd < 0) {
+		/* logging may overwrite errno, so capture it first */
+		int err = errno;
+
+		VMBUS_LOG(ERR, "Cannot open %s: %s",
+			  ring_path, strerror(err));
+		return -err;
+	}
+
+	mapaddr = vmbus_map_resource(ring_buf, fd, 0, 2 * ring_size, 0);
+	close(fd);
+
+	if (mapaddr == ring_buf)
+		return 0;
+
+	if (mapaddr == MAP_FAILED)
+		VMBUS_LOG(ERR,
+			  "mmap subchan %u in secondary failed", chan->relid);
+	else {
+		VMBUS_LOG(ERR,
+			  "mmap subchan %u in secondary address mismatch",
+			  chan->relid);
+		vmbus_unmap_resource(mapaddr, 2 * ring_size);
+	}
+	return -1;
+}
+
+/*
+ * Locate the ring memory of a channel and set up its two buffer rings.
+ *
+ * A channel with subchannel_id 0 takes its ring from the mapping record of
+ * the device, any other channel maps its own sysfs ring file. The first
+ * half of the area becomes the transmit ring, the second half the receive
+ * ring.
+ *
+ * Returns 0 on success or the error of the lookup/mapping step.
+ */
+int vmbus_uio_map_rings(struct vmbus_channel *chan)
+{
+	uint32_t half;
+	void *base;
+	int rc;
+
+	if (chan->subchannel_id != 0)
+		rc = vmbus_uio_map_subchan(chan->device, chan, &base, &half);
+	else
+		rc = vmbus_uio_map_primary(chan, &base, &half); /* Primary channel */
+
+	if (rc != 0)
+		return rc;
+
+	vmbus_br_setup(&chan->txbr, base, half);
+	vmbus_br_setup(&chan->rxbr, (char *)base + half, half);
+
+	return 0;
+}
+
+/*
+ * Read an unsigned decimal value from the sysfs file dir/name.
+ *
+ * Returns 0 and stores the value in *val on success, -errno when the file
+ * cannot be opened, -EIO when no number can be parsed and -ERANGE when the
+ * value is larger than max_range (*val is still written in that case).
+ */
+static int vmbus_uio_sysfs_read(const char *dir, const char *name,
+				unsigned long *val, unsigned long max_range)
+{
+	char path[PATH_MAX];
+	FILE *f;
+	int ret;
+
+	snprintf(path, sizeof(path), "%s/%s", dir, name);
+	f = fopen(path, "r");
+	if (!f) {
+		/* logging may overwrite errno, so capture it first */
+		ret = errno;
+		VMBUS_LOG(ERR, "can't open %s:%s",
+			  path, strerror(ret));
+		return -ret;
+	}
+
+	if (fscanf(f, "%lu", val) != 1)
+		ret = -EIO;
+	else if (*val > max_range)
+		ret = -ERANGE;
+	else
+		ret = 0;
+	fclose(f);
+
+	return ret;
+}
+
+/*
+ * Tell whether the sysfs "ring" file of channel relid exists for this
+ * device and is both readable and writable by the caller.
+ */
+static bool vmbus_uio_ring_present(const struct rte_vmbus_device *dev,
+				   uint32_t relid)
+{
+	char path[PATH_MAX];
+	int rc;
+
+	/* Check if kernel has subchannel sysfs files */
+	snprintf(path, sizeof(path),
+		 "%s/%s/channels/%u/ring",
+		 SYSFS_VMBUS_DEVICES, dev->device.name, relid);
+
+	rc = access(path, R_OK|W_OK);
+
+	return rc == 0;
+}
+
+/*
+ * Report whether the sysfs ring file of the given channel is present and
+ * accessible, as determined by vmbus_uio_ring_present() for chan->relid.
+ */
+bool vmbus_uio_subchannels_supported(const struct rte_vmbus_device *dev,
+ const struct vmbus_channel *chan)
+{
+ return vmbus_uio_ring_present(dev, chan->relid);
+}
+
+/*
+ * Return true when no channel on the subchannel list of primary has the
+ * relid id, false when one does.
+ */
+static bool vmbus_isnew_subchannel(struct vmbus_channel *primary,
+				   unsigned long id)
+{
+	const struct vmbus_channel *sub;
+	bool unseen = true;
+
+	STAILQ_FOREACH(sub, &primary->subchannel_list, next) {
+		if (sub->relid == id) {
+			unseen = false;
+			break;
+		}
+	}
+
+	return unseen;
+}
+
+/*
+ * Find one subchannel of a device that is not yet known and create it.
+ *
+ * The "channels" sysfs directory of the device is scanned. Entries that
+ * are not a valid relid, that are already on the subchannel list of
+ * primary, that have no accessible ring file yet, or whose subchannel_id
+ * is 0 (the primary channel) are skipped. The first remaining entry is
+ * passed to vmbus_chan_create() and the scan stops.
+ *
+ * Returns 0 when a channel was created (stored through subchan), -ENOENT
+ * when no new subchannel was found, -errno when the directory cannot be
+ * opened, or the error of the failing sysfs read / channel creation.
+ */
+int vmbus_uio_get_subchan(struct vmbus_channel *primary,
+			  struct vmbus_channel **subchan)
+{
+	const struct rte_vmbus_device *dev = primary->device;
+	char chan_path[PATH_MAX], subchan_path[PATH_MAX];
+	struct dirent *ent;
+	bool found = false;
+	DIR *chan_dir;
+	int err;
+
+	snprintf(chan_path, sizeof(chan_path),
+		 "%s/%s/channels",
+		 SYSFS_VMBUS_DEVICES, dev->device.name);
+
+	chan_dir = opendir(chan_path);
+	if (!chan_dir) {
+		/* logging may overwrite errno, so capture it first */
+		err = errno;
+		VMBUS_LOG(ERR, "cannot open %s: %s",
+			  chan_path, strerror(err));
+		return -err;
+	}
+
+	while ((ent = readdir(chan_dir))) {
+		unsigned long relid, subid, monid;
+		char *endp;
+
+		if (ent->d_name[0] == '.')
+			continue;
+
+		errno = 0;
+		relid = strtoul(ent->d_name, &endp, 0);
+		if (*endp || errno != 0 || relid > UINT16_MAX) {
+			VMBUS_LOG(NOTICE, "not a valid channel relid: %s",
+				  ent->d_name);
+			continue;
+		}
+
+		if (!vmbus_isnew_subchannel(primary, relid)) {
+			VMBUS_LOG(DEBUG, "skip already found channel: %lu",
+				  relid);
+			continue;
+		}
+
+		if (!vmbus_uio_ring_present(dev, relid)) {
+			VMBUS_LOG(DEBUG, "ring mmap not found (yet) for: %lu",
+				  relid);
+			continue;
+		}
+
+		snprintf(subchan_path, sizeof(subchan_path), "%s/%lu",
+			 chan_path, relid);
+		err = vmbus_uio_sysfs_read(subchan_path, "subchannel_id",
+					   &subid, UINT16_MAX);
+		if (err) {
+			VMBUS_LOG(NOTICE, "no subchannel_id in %s:%s",
+				  subchan_path, strerror(-err));
+			goto fail;
+		}
+
+		if (subid == 0)
+			continue;	/* skip primary channel */
+
+		err = vmbus_uio_sysfs_read(subchan_path, "monitor_id",
+					   &monid, UINT8_MAX);
+		if (err) {
+			VMBUS_LOG(NOTICE, "no monitor_id in %s:%s",
+				  subchan_path, strerror(-err));
+			goto fail;
+		}
+
+		err = vmbus_chan_create(dev, relid, subid, monid, subchan);
+		if (err) {
+			VMBUS_LOG(ERR, "subchannel setup failed");
+			goto fail;
+		}
+
+		/* remember the outcome; ent is not valid after closedir() */
+		found = true;
+		break;
+	}
+	closedir(chan_dir);
+
+	return found ? 0 : -ENOENT;
+fail:
+	closedir(chan_dir);
+	return err;
+}
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/meson.build b/src/seastar/dpdk/drivers/bus/vmbus/meson.build
new file mode 100644
index 000000000..9fd430dae
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/meson.build
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+version = 2
+
+allow_experimental_apis = true
+
+install_headers('rte_bus_vmbus.h','rte_vmbus_reg.h')
+
+# Sources that do not depend on the operating system
+sources = files('vmbus_common.c',
+ 'vmbus_channel.c',
+ 'vmbus_bufring.c',
+ 'vmbus_common_uio.c')
+
+# Linux provides the only OS-specific implementation; on any other system
+# the component is not built.
+if is_linux
+ sources += files('linux/vmbus_bus.c',
+ 'linux/vmbus_uio.c')
+ includes += include_directories('linux')
+else
+ build = false
+endif
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/private.h b/src/seastar/dpdk/drivers/bus/vmbus/private.h
new file mode 100644
index 000000000..f19b14e4a
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/private.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#ifndef _VMBUS_PRIVATE_H_
+#define _VMBUS_PRIVATE_H_
+
+#include <stdbool.h>
+#include <sys/uio.h>
+#include <rte_log.h>
+#include <rte_vmbus_reg.h>
+#include <rte_bus_vmbus.h>
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+extern struct rte_vmbus_bus rte_vmbus_bus;
+
+extern int vmbus_logtype_bus;
+/*
+ * Log a message with the vmbus bus log type. "level" is the suffix of an
+ * RTE_LOG_ level name (ERR, NOTICE, DEBUG, ...). The calling function name
+ * is prepended and a newline appended, so fmt should not end in "\n".
+ */
+#define VMBUS_LOG(level, fmt, args...) \
+ rte_log(RTE_LOG_ ## level, vmbus_logtype_bus, "%s(): " fmt "\n", \
+ __func__, ##args)
+
+/*
+ * Per-direction view of a channel buffer ring.
+ * NOTE(review): dsize looks like the size of the data area alone (the
+ * secondary mapping code adds sizeof(struct vmbus_bufring) to it to get
+ * the size of one ring) - confirm against vmbus_br_setup().
+ */
+struct vmbus_br {
+ struct vmbus_bufring *vbr; /* the ring itself, in mapped memory */
+ uint32_t dsize; /* wrap-around size used by the avail helpers */
+ uint32_t windex; /* next available location */
+};
+
+#define UIO_NAME_MAX 64
+
+/* Address and length of one mapped UIO resource, as recorded after mmap. */
+struct vmbus_map {
+ void *addr; /* user mmap of resource */
+ uint64_t size; /* length */
+};
+
+/*
+ * For multi-process we need to reproduce all vmbus mappings in secondary
+ * processes, so save them in a tailq.
+ *
+ * One record exists per device; it is allocated and its id/path filled by
+ * vmbus_uio_alloc_resource().
+ */
+struct mapped_vmbus_resource {
+ TAILQ_ENTRY(mapped_vmbus_resource) next;
+
+ rte_uuid_t id; /* copy of the device_id of the device */
+ int nb_maps; /* number of maps recorded; must reach VMBUS_MAX_RESOURCE
+ * before the primary ring can be used */
+ struct vmbus_channel *primary; /* presumably the primary channel of the
+ * device - confirm where it is set */
+ struct vmbus_map maps[VMBUS_MAX_RESOURCE]; /* indexed by enum hv_uio_map */
+ char path[PATH_MAX]; /* UIO device file, e.g. /dev/uioX */
+};
+
+TAILQ_HEAD(mapped_vmbus_res_list, mapped_vmbus_resource);
+
+#define HV_MON_TRIG_LEN 32
+#define HV_MON_TRIG_MAX 4
+
+/*
+ * One vmbus channel. A channel with subchannel_id 0 is the primary channel
+ * of its device; the subchannels found for it are linked on its
+ * subchannel_list through their "next" entry.
+ */
+struct vmbus_channel {
+ STAILQ_HEAD(, vmbus_channel) subchannel_list;
+ STAILQ_ENTRY(vmbus_channel) next;
+ const struct rte_vmbus_device *device; /* device the channel belongs to */
+
+ struct vmbus_br rxbr; /* receive ring: second half of the ring area */
+ struct vmbus_br txbr; /* transmit ring: first half of the ring area */
+
+ uint16_t relid; /* channel id; names the channels/<relid> sysfs dir */
+ uint16_t subchannel_id; /* 0 for the primary channel */
+ uint8_t monitor_id;
+};
+
+#define VMBUS_MAX_CHANNELS 64
+
+struct rte_devargs *
+vmbus_devargs_lookup(struct rte_vmbus_device *dev);
+
+int vmbus_chan_create(const struct rte_vmbus_device *device,
+ uint16_t relid, uint16_t subid, uint8_t monitor_id,
+ struct vmbus_channel **new_chan);
+
+void vmbus_add_device(struct rte_vmbus_device *vmbus_dev);
+void vmbus_insert_device(struct rte_vmbus_device *exist_vmbus_dev,
+ struct rte_vmbus_device *new_vmbus_dev);
+void vmbus_remove_device(struct rte_vmbus_device *vmbus_device);
+
+void vmbus_uio_irq_control(struct rte_vmbus_device *dev, int32_t onoff);
+int vmbus_uio_irq_read(struct rte_vmbus_device *dev);
+
+int vmbus_uio_map_resource(struct rte_vmbus_device *dev);
+void vmbus_uio_unmap_resource(struct rte_vmbus_device *dev);
+
+int vmbus_uio_alloc_resource(struct rte_vmbus_device *dev,
+ struct mapped_vmbus_resource **uio_res);
+void vmbus_uio_free_resource(struct rte_vmbus_device *dev,
+ struct mapped_vmbus_resource *uio_res);
+
+struct mapped_vmbus_resource *
+vmbus_uio_find_resource(const struct rte_vmbus_device *dev);
+int vmbus_uio_map_resource_by_index(struct rte_vmbus_device *dev, int res_idx,
+ struct mapped_vmbus_resource *uio_res,
+ int flags);
+
+void *vmbus_map_resource(void *requested_addr, int fd, off_t offset,
+ size_t size, int additional_flags);
+void vmbus_unmap_resource(void *requested_addr, size_t size);
+
+bool vmbus_uio_subchannels_supported(const struct rte_vmbus_device *dev,
+ const struct vmbus_channel *chan);
+int vmbus_uio_get_subchan(struct vmbus_channel *primary,
+ struct vmbus_channel **subchan);
+int vmbus_uio_map_rings(struct vmbus_channel *chan);
+int vmbus_uio_map_secondary_subchan(const struct rte_vmbus_device *dev,
+ const struct vmbus_channel *chan);
+
+void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen);
+
+/*
+ * Amount of space available for write, given a write index.
+ *
+ * The read index is sampled once from the ring. When windex is at or past
+ * it the result is dsize minus the distance between the two, otherwise it
+ * is the gap from windex up to the read index.
+ */
+static inline uint32_t
+vmbus_br_availwrite(const struct vmbus_br *br, uint32_t windex)
+{
+	const uint32_t rindex = br->vbr->rindex;
+
+	return (windex < rindex) ? rindex - windex
+				 : br->dsize - (windex - rindex);
+}
+
+/*
+ * Amount of data available for read: dsize minus the space that is
+ * writable at the write index currently stored in the ring.
+ */
+static inline uint32_t
+vmbus_br_availread(const struct vmbus_br *br)
+{
+	const uint32_t writable = vmbus_br_availwrite(br, br->vbr->windex);
+
+	return br->dsize - writable;
+}
+
+int vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen,
+ bool *need_sig);
+
+int vmbus_rxbr_peek(const struct vmbus_br *rbr, void *data, size_t dlen);
+
+int vmbus_rxbr_read(struct vmbus_br *rbr, void *data, size_t dlen, size_t hlen);
+
+#endif /* _VMBUS_PRIVATE_H_ */
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus.h b/src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus.h
new file mode 100644
index 000000000..4cf73ce81
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus.h
@@ -0,0 +1,421 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#ifndef _VMBUS_H_
+#define _VMBUS_H_
+
+/**
+ * @file
+ *
+ * VMBUS Interface
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <sys/queue.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_compat.h>
+#include <rte_uuid.h>
+#include <rte_debug.h>
+#include <rte_interrupts.h>
+#include <rte_dev.h>
+#include <rte_vmbus_reg.h>
+
+/* Forward declarations */
+struct rte_vmbus_device;
+struct rte_vmbus_driver;
+struct rte_vmbus_bus;
+struct vmbus_channel;
+struct vmbus_mon_page;
+
+TAILQ_HEAD(rte_vmbus_device_list, rte_vmbus_device);
+TAILQ_HEAD(rte_vmbus_driver_list, rte_vmbus_driver);
+
+/* VMBus iterators */
+#define FOREACH_DEVICE_ON_VMBUS(p) \
+ TAILQ_FOREACH(p, &(rte_vmbus_bus.device_list), next)
+
+#define FOREACH_DRIVER_ON_VMBUS(p) \
+ TAILQ_FOREACH(p, &(rte_vmbus_bus.driver_list), next)
+
+/** Kinds of VMBUS resource maps (VMBUS_MAX_RESOURCE is their count). */
+enum hv_uio_map {
+ HV_TXRX_RING_MAP = 0,
+ HV_INT_PAGE_MAP,
+ HV_MON_PAGE_MAP,
+ HV_RECV_BUF_MAP,
+ HV_SEND_BUF_MAP
+};
+#define VMBUS_MAX_RESOURCE 5
+
+/**
+ * A structure describing a VMBUS device.
+ */
+struct rte_vmbus_device {
+ TAILQ_ENTRY(rte_vmbus_device) next; /**< Next probed VMBUS device */
+ const struct rte_vmbus_driver *driver; /**< Associated driver */
+ struct rte_device device; /**< Inherit core device */
+ rte_uuid_t device_id; /**< VMBUS device id */
+ rte_uuid_t class_id; /**< VMBUS device type */
+ uint32_t relid; /**< id for primary */
+ uint8_t monitor_id; /**< monitor page */
+ int uio_num; /**< UIO device number */
+ uint32_t *int_page; /**< VMBUS interrupt page */
+ struct vmbus_channel *primary; /**< VMBUS primary channel */
+ struct vmbus_mon_page *monitor_page; /**< VMBUS monitor page */
+
+ struct rte_intr_handle intr_handle; /**< Interrupt handle */
+ struct rte_mem_resource resource[VMBUS_MAX_RESOURCE];
+};
+
+/**
+ * Initialization function for the driver called during VMBUS probing.
+ */
+typedef int (vmbus_probe_t)(struct rte_vmbus_driver *,
+ struct rte_vmbus_device *);
+
+/**
+ * Uninitialization function for the driver called during device removal.
+ */
+typedef int (vmbus_remove_t)(struct rte_vmbus_device *);
+
+/**
+ * A structure describing a VMBUS driver.
+ */
+struct rte_vmbus_driver {
+ TAILQ_ENTRY(rte_vmbus_driver) next; /**< Next in list. */
+ struct rte_driver driver;
+ struct rte_vmbus_bus *bus; /**< VM bus reference. */
+ vmbus_probe_t *probe; /**< Device Probe function. */
+ vmbus_remove_t *remove; /**< Device Remove function. */
+
+ const rte_uuid_t *id_table; /**< ID table. */
+};
+
+
+/**
+ * Structure describing the VM bus
+ */
+struct rte_vmbus_bus {
+ struct rte_bus bus; /**< Inherit the generic class */
+ struct rte_vmbus_device_list device_list; /**< List of devices */
+ struct rte_vmbus_driver_list driver_list; /**< List of drivers */
+};
+
+/**
+ * Scan the content of the VMBUS bus, and the devices in the devices
+ * list
+ *
+ * @return
+ * 0 on success, negative on error
+ */
+int rte_vmbus_scan(void);
+
+/**
+ * Probe the VMBUS bus
+ *
+ * @return
+ * - 0 on success.
+ * - !0 on error.
+ */
+int rte_vmbus_probe(void);
+
+/**
+ * Map the VMBUS device resources in user space virtual memory address
+ *
+ * @param dev
+ * A pointer to a rte_vmbus_device structure describing the device
+ * to use
+ *
+ * @return
+ * 0 on success, negative on error and positive if no driver
+ * is found for the device.
+ */
+int rte_vmbus_map_device(struct rte_vmbus_device *dev);
+
+/**
+ * Unmap this device
+ *
+ * @param dev
+ * A pointer to a rte_vmbus_device structure describing the device
+ * to use
+ */
+void rte_vmbus_unmap_device(struct rte_vmbus_device *dev);
+
+/**
+ * Get connection to primary VMBUS channel
+ *
+ * @param device
+ * A pointer to a rte_vmbus_device structure describing the device
+ * @param chan
+ * A pointer to a VMBUS channel pointer that will be filled.
+ * @return
+ * - 0 Success; channel opened.
+ * - -ENOMEM: Not enough memory available.
+ * - -EINVAL: Regions could not be mapped.
+ */
+int rte_vmbus_chan_open(struct rte_vmbus_device *device,
+ struct vmbus_channel **chan);
+
+/**
+ * Free connection to VMBUS channel
+ *
+ * @param chan
+ * VMBUS channel
+ */
+void rte_vmbus_chan_close(struct vmbus_channel *chan);
+
+/**
+ * Gets the maximum number of channels supported on device
+ *
+ * @param device
+ * A pointer to a rte_vmbus_device structure describing the device
+ * @return
+ * Number of channels available.
+ */
+int rte_vmbus_max_channels(const struct rte_vmbus_device *device);
+
+/**
+ * Get a connection to new secondary vmbus channel
+ *
+ * @param primary
+ * A pointer to primary VMBUS channel
+ * @param new_chan
+ * A pointer to a secondary VMBUS channel pointer that will be filled.
+ * @return
+ * - 0 Success; channel opened.
+ * - -ENOMEM: Not enough memory available.
+ * - -EINVAL: Regions could not be mapped.
+ */
+int rte_vmbus_subchan_open(struct vmbus_channel *primary,
+ struct vmbus_channel **new_chan);
+
+/**
+ * Disable IRQ for device
+ *
+ * @param device
+ * VMBUS device
+ */
+void rte_vmbus_irq_mask(struct rte_vmbus_device *device);
+
+/**
+ * Enable IRQ for device
+ *
+ * @param device
+ * VMBUS device
+ */
+void rte_vmbus_irq_unmask(struct rte_vmbus_device *device);
+
+/**
+ * Read (and wait) for IRQ
+ *
+ * @param device
+ * VMBUS device
+ */
+int rte_vmbus_irq_read(struct rte_vmbus_device *device);
+
+/**
+ * Test if channel is empty
+ *
+ * @param channel
+ * Pointer to vmbus_channel structure.
+ * @return
+ * Return true if no data present in incoming ring.
+ */
+bool rte_vmbus_chan_rx_empty(const struct vmbus_channel *channel);
+
+/**
+ * Send the specified buffer on the given channel
+ *
+ * @param channel
+ * Pointer to vmbus_channel structure.
+ * @param type
+ * Type of packet that is being sent e.g. negotiate, time
+ * packet etc.
+ * @param data
+ * Pointer to the buffer to send
+ * @param dlen
+ * Number of bytes of data to send
+ * @param xact
+ * Identifier of the request
+ * @param flags
+ * Message type inband, rxbuf, gpa
+ * @param need_sig
+ * Whether host signal tx is required (optional)
+ *
+ * Sends data in buffer directly to hyper-v via the vmbus
+ */
+int rte_vmbus_chan_send(struct vmbus_channel *channel, uint16_t type,
+ void *data, uint32_t dlen,
+ uint64_t xact, uint32_t flags, bool *need_sig);
+
+/**
+ * Explicitly signal host that data is available
+ *
+ * @param channel
+ * Pointer to vmbus_channel structure.
+ *
+ * Used when batching multiple sends and only signaling host
+ * after the last send.
+ */
+void rte_vmbus_chan_signal_tx(const struct vmbus_channel *channel);
+
+/* Structure for scatter/gather I/O */
+struct iova_list {
+ rte_iova_t addr;
+ uint32_t len;
+};
+#define MAX_PAGE_BUFFER_COUNT 32
+
+/**
+ * Send a scattered buffer on the given channel
+ *
+ * @param channel
+ * Pointer to vmbus_channel structure.
+ * @param gpa
+ * Array of buffers to send
+ * @param gpacnt
+ * Number of elements in gpa
+ * @param data
+ * Pointer to the buffer of additional data to send
+ * @param dlen
+ * Maximum size of what the buffer will hold
+ * @param xact
+ * Identifier of the request
+ * @param need_sig
+ * Whether host signal tx is required (optional)
+ *
+ * Sends data in buffer directly to hyper-v via the vmbus
+ */
+int rte_vmbus_chan_send_sglist(struct vmbus_channel *channel,
+ struct vmbus_gpa gpa[], uint32_t gpacnt,
+ void *data, uint32_t dlen,
+ uint64_t xact, bool *need_sig);
+/**
+ * Receive response to request on the given channel
+ * skips the channel header.
+ *
+ * @param chan
+ * Pointer to vmbus_channel structure.
+ * @param data
+ * Pointer to the buffer you want to receive the data into.
+ * @param len
+ * Pointer to size of receive buffer (in/out)
+ * @param request_id
+ * Pointer to received transaction_id
+ * @return
+ * On success, returns 0
+ * On failure, returns negative errno.
+ */
+int rte_vmbus_chan_recv(struct vmbus_channel *chan,
+ void *data, uint32_t *len,
+ uint64_t *request_id);
+
+/**
+ * Receive response to request on the given channel
+ * includes the channel header.
+ *
+ * @param chan
+ * Pointer to vmbus_channel structure.
+ * @param data
+ * Pointer to the buffer you want to receive the data into.
+ * @param len
+ * Pointer to size of receive buffer (in/out)
+ * @return
+ * On success, returns number of bytes read.
+ * On failure, returns negative errno.
+ */
+int rte_vmbus_chan_recv_raw(struct vmbus_channel *chan,
+ void *data, uint32_t *len);
+
+/**
+ * Notify host of bytes read (after recv_raw)
+ * Signals host if required.
+ *
+ * @param chan
+ * Pointer to vmbus_channel structure.
+ * @param bytes_read
+ * Number of bytes read since last signal
+ */
+void rte_vmbus_chan_signal_read(struct vmbus_channel *chan, uint32_t bytes_read);
+
+/**
+ * Determine sub channel index of the given channel
+ *
+ * @param chan
+ * Pointer to vmbus_channel structure.
+ * @return
+ * Sub channel index (0 for primary)
+ */
+uint16_t rte_vmbus_sub_channel_index(const struct vmbus_channel *chan);
+
+/**
+ * Set the host monitor latency hint
+ *
+ * @param dev
+ * VMBUS device
+ * @param chan
+ * Pointer to vmbus_channel structure.
+ * @param latency
+ * Approximate wait period between hypervisor examinations of
+ * the trigger page (in nanoseconds).
+ */
+void rte_vmbus_set_latency(const struct rte_vmbus_device *dev,
+ const struct vmbus_channel *chan,
+ uint32_t latency);
+
+/**
+ * Register a VMBUS driver.
+ *
+ * @param driver
+ * A pointer to a rte_vmbus_driver structure describing the driver
+ * to be registered.
+ */
+void rte_vmbus_register(struct rte_vmbus_driver *driver);
+
+/**
+ * Dump contents of ring buffer (for debugging).
+ *
+ * @param chan
+ * Pointer to vmbus_channel structure.
+ */
+void rte_vmbus_chan_dump(FILE *f, const struct vmbus_channel *chan);
+
+/**
+ * Unregister a VMBUS driver.
+ *
+ * @param driver
+ * A pointer to a rte_vmbus_driver structure describing the driver
+ * to be unregistered.
+ */
+void rte_vmbus_unregister(struct rte_vmbus_driver *driver);
+
+/** Helper for VMBUS device registration from driver instance */
+#define RTE_PMD_REGISTER_VMBUS(nm, vmbus_drv) \
+ RTE_INIT(vmbusinitfn_ ##nm) \
+ { \
+ (vmbus_drv).driver.name = RTE_STR(nm); \
+ rte_vmbus_register(&vmbus_drv); \
+ } \
+ RTE_PMD_EXPORT_NAME(nm, __COUNTER__)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VMBUS_H_ */
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus_version.map b/src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus_version.map
new file mode 100644
index 000000000..ae231ad32
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/rte_bus_vmbus_version.map
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+
+DPDK_18.08 {
+ global:
+
+ rte_vmbus_chan_close;
+ rte_vmbus_chan_open;
+ rte_vmbus_chan_recv;
+ rte_vmbus_chan_recv_raw;
+ rte_vmbus_chan_rx_empty;
+ rte_vmbus_chan_send;
+ rte_vmbus_chan_send_sglist;
+ rte_vmbus_chan_signal_read;
+ rte_vmbus_chan_signal_tx;
+ rte_vmbus_irq_mask;
+ rte_vmbus_irq_read;
+ rte_vmbus_irq_unmask;
+ rte_vmbus_map_device;
+ rte_vmbus_max_channels;
+ rte_vmbus_probe;
+ rte_vmbus_register;
+ rte_vmbus_scan;
+ rte_vmbus_sub_channel_index;
+ rte_vmbus_subchan_open;
+ rte_vmbus_unmap_device;
+ rte_vmbus_unregister;
+
+ local: *;
+};
+
+DPDK_18.11 {
+ global:
+
+ rte_vmbus_set_latency;
+
+} DPDK_18.08;
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/rte_vmbus_reg.h b/src/seastar/dpdk/drivers/bus/vmbus/rte_vmbus_reg.h
new file mode 100644
index 000000000..f5a0693dc
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/rte_vmbus_reg.h
@@ -0,0 +1,344 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#ifndef _VMBUS_REG_H_
+#define _VMBUS_REG_H_
+
+/*
+ * Hyper-V SynIC message format.
+ */
+#define VMBUS_MSG_DSIZE_MAX 240
+#define VMBUS_MSG_SIZE 256
+
+struct vmbus_message {
+ uint32_t type; /* HYPERV_MSGTYPE_ */
+ uint8_t dsize; /* data size */
+ uint8_t flags; /* VMBUS_MSGFLAG_ */
+ uint16_t rsvd;
+ uint64_t id;
+ uint8_t data[VMBUS_MSG_DSIZE_MAX];
+} __rte_packed;
+
+#define VMBUS_MSGFLAG_PENDING 0x01
+
+/*
+ * Hyper-V Monitor Notification Facility
+ */
+
+struct vmbus_mon_trig {
+ uint32_t pending;
+ uint32_t armed;
+} __rte_packed;
+
+#define VMBUS_MONTRIGS_MAX 4
+#define VMBUS_MONTRIG_LEN 32
+
+/*
+ * Hyper-V Monitor Notification Facility
+ */
+struct hyperv_mon_param {
+ uint32_t connid;
+ uint16_t evtflag_ofs;
+ uint16_t rsvd;
+} __rte_packed;
+
+struct vmbus_mon_page {
+ uint32_t state;
+ uint32_t rsvd1;
+
+ struct vmbus_mon_trig trigs[VMBUS_MONTRIGS_MAX];
+ uint8_t rsvd2[536];
+
+ uint16_t lat[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN];
+ uint8_t rsvd3[256];
+
+ struct hyperv_mon_param
+ param[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN];
+ uint8_t rsvd4[1984];
+} __rte_packed;
+
+/*
+ * Buffer ring
+ */
+
+struct vmbus_bufring {
+ volatile uint32_t windex;
+ volatile uint32_t rindex;
+
+ /*
+ * Interrupt mask {0,1}
+ *
+ * For TX bufring, host set this to 1, when it is processing
+ * the TX bufring, so that we can safely skip the TX event
+ * notification to host.
+ *
+ * For RX bufring, once this is set to 1 by us, host will not
+ * further dispatch interrupts to us, even if there are data
+ * pending on the RX bufring. This effectively disables the
+ * interrupt of the channel to which this RX bufring is attached.
+ */
+ volatile uint32_t imask;
+
+ /*
+ * Win8 uses some of the reserved bits to implement
+ * interrupt driven flow management. On the send side
+ * we can request that the receiver interrupt the sender
+ * when the ring transitions from being full to being able
+ * to handle a message of size "pending_send_sz".
+ *
+ * Add necessary state for this enhancement.
+ */
+ volatile uint32_t pending_send;
+ uint32_t reserved1[12];
+
+ union {
+ struct {
+ uint32_t feat_pending_send_sz:1;
+ };
+ uint32_t value;
+ } feature_bits;
+
+ /* Pad it to PAGE_SIZE so that data starts on page boundary */
+ uint8_t reserved2[4028];
+
+ /*
+ * Ring data starts here + RingDataStartOffset
+ * !!! DO NOT place any fields below this !!!
+ */
+ uint8_t data[0];
+} __rte_packed;
+
+/*
+ * Channel packets
+ */
+
+/* Channel packet types */
+#define VMBUS_CHANPKT_TYPE_INBAND 0x0006
+#define VMBUS_CHANPKT_TYPE_RXBUF 0x0007
+#define VMBUS_CHANPKT_TYPE_GPA 0x0009
+#define VMBUS_CHANPKT_TYPE_COMP 0x000b
+
+#define VMBUS_CHANPKT_FLAG_NONE 0
+#define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */
+
+#define VMBUS_CHANPKT_SIZE_SHIFT 3
+#define VMBUS_CHANPKT_SIZE_ALIGN (1 << VMBUS_CHANPKT_SIZE_SHIFT)
+#define VMBUS_CHANPKT_HLEN_MIN \
+ (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT)
+
+/* Convert a packet length field to bytes (shift by VMBUS_CHANPKT_SIZE_SHIFT) */
+static inline uint32_t
+vmbus_chanpkt_getlen(uint16_t pktlen)
+{
+	/* Widen first so the shift is done in 32 bits */
+	const uint32_t units = pktlen;
+
+	return units << VMBUS_CHANPKT_SIZE_SHIFT;
+}
+
+/*
+ * GPA stuffs.
+ */
+struct vmbus_gpa_range {
+ uint32_t len;
+ uint32_t ofs;
+ uint64_t page[0];
+} __rte_packed;
+
+/* This is actually vmbus_gpa_range with page[1] */
+struct vmbus_gpa {
+ uint32_t len;
+ uint32_t ofs;
+ uint64_t page;
+} __rte_packed;
+
+struct vmbus_chanpkt_hdr {
+ uint16_t type; /* VMBUS_CHANPKT_TYPE_ */
+ uint16_t hlen; /* header len, in 8 bytes */
+ uint16_t tlen; /* total len, in 8 bytes */
+ uint16_t flags; /* VMBUS_CHANPKT_FLAG_ */
+ uint64_t xactid;
+} __rte_packed;
+
+/* Payload length in bytes: total packet length minus header length */
+static inline uint32_t
+vmbus_chanpkt_datalen(const struct vmbus_chanpkt_hdr *pkt)
+{
+	const uint32_t total_bytes = vmbus_chanpkt_getlen(pkt->tlen);
+	const uint32_t hdr_bytes = vmbus_chanpkt_getlen(pkt->hlen);
+
+	return total_bytes - hdr_bytes;
+}
+
+struct vmbus_chanpkt {
+ struct vmbus_chanpkt_hdr hdr;
+} __rte_packed;
+
+struct vmbus_rxbuf_desc {
+ uint32_t len;
+ uint32_t ofs;
+} __rte_packed;
+
+struct vmbus_chanpkt_rxbuf {
+ struct vmbus_chanpkt_hdr hdr;
+ uint16_t rxbuf_id;
+ uint16_t rsvd;
+ uint32_t rxbuf_cnt;
+ struct vmbus_rxbuf_desc rxbuf[];
+} __rte_packed;
+
+struct vmbus_chanpkt_sglist {
+ struct vmbus_chanpkt_hdr hdr;
+ uint32_t rsvd;
+ uint32_t gpa_cnt;
+ struct vmbus_gpa gpa[];
+} __rte_packed;
+
+/*
+ * Channel messages
+ * - Embedded in vmbus_message.data, e.g. response and notification.
+ * - Embedded in hypercall_postmsg_in.hc_data, e.g. request.
+ */
+
+#define VMBUS_CHANMSG_TYPE_CHOFFER 1 /* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHRESCIND 2 /* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHREQUEST 3 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CHOFFER_DONE 4 /* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHOPEN 5 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CHOPEN_RESP 6 /* RESP */
+#define VMBUS_CHANMSG_TYPE_CHCLOSE 7 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_CONN 8 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_SUBCONN 9 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_CONNRESP 10 /* RESP */
+#define VMBUS_CHANMSG_TYPE_GPADL_DISCONN 11 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_DISCONNRESP 12 /* RESP */
+#define VMBUS_CHANMSG_TYPE_CHFREE 13 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CONNECT 14 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CONNECT_RESP 15 /* RESP */
+#define VMBUS_CHANMSG_TYPE_DISCONNECT 16 /* REQ */
+#define VMBUS_CHANMSG_TYPE_MAX 22
+
+struct vmbus_chanmsg_hdr {
+ uint32_t type; /* VMBUS_CHANMSG_TYPE_ */
+ uint32_t rsvd;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CONNECT */
+struct vmbus_chanmsg_connect {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t ver;
+ uint32_t rsvd;
+ uint64_t evtflags;
+ uint64_t mnf1;
+ uint64_t mnf2;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CONNECT_RESP */
+struct vmbus_chanmsg_connect_resp {
+ struct vmbus_chanmsg_hdr hdr;
+ uint8_t done;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CHREQUEST */
+struct vmbus_chanmsg_chrequest {
+ struct vmbus_chanmsg_hdr hdr;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_DISCONNECT */
+struct vmbus_chanmsg_disconnect {
+ struct vmbus_chanmsg_hdr hdr;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CHOPEN */
+struct vmbus_chanmsg_chopen {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+ uint32_t openid;
+ uint32_t gpadl;
+ uint32_t vcpuid;
+ uint32_t txbr_pgcnt;
+#define VMBUS_CHANMSG_CHOPEN_UDATA_SIZE 120
+ uint8_t udata[VMBUS_CHANMSG_CHOPEN_UDATA_SIZE];
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CHOPEN_RESP */
+struct vmbus_chanmsg_chopen_resp {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+ uint32_t openid;
+ uint32_t status;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_GPADL_CONN */
+struct vmbus_chanmsg_gpadl_conn {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+ uint32_t gpadl;
+ uint16_t range_len;
+ uint16_t range_cnt;
+ struct vmbus_gpa_range range;
+} __rte_packed;
+
+#define VMBUS_CHANMSG_GPADL_CONN_PGMAX 26
+
+/* VMBUS_CHANMSG_TYPE_GPADL_SUBCONN */
+struct vmbus_chanmsg_gpadl_subconn {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t msgno;
+ uint32_t gpadl;
+ uint64_t gpa_page[];
+} __rte_packed;
+
+#define VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX 28
+
+/* VMBUS_CHANMSG_TYPE_GPADL_CONNRESP */
+struct vmbus_chanmsg_gpadl_connresp {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+ uint32_t gpadl;
+ uint32_t status;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CHCLOSE */
+struct vmbus_chanmsg_chclose {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_GPADL_DISCONN */
+struct vmbus_chanmsg_gpadl_disconn {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+ uint32_t gpadl;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CHFREE */
+struct vmbus_chanmsg_chfree {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CHRESCIND */
+struct vmbus_chanmsg_chrescind {
+ struct vmbus_chanmsg_hdr hdr;
+ uint32_t chanid;
+} __rte_packed;
+
+/* VMBUS_CHANMSG_TYPE_CHOFFER */
+struct vmbus_chanmsg_choffer {
+ struct vmbus_chanmsg_hdr hdr;
+ rte_uuid_t chtype;
+ rte_uuid_t chinst;
+ uint64_t chlat; /* unit: 100ns */
+ uint32_t chrev;
+ uint32_t svrctx_sz;
+ uint16_t chflags;
+ uint16_t mmio_sz; /* unit: MB */
+ uint8_t udata[120];
+ uint16_t subidx;
+ uint16_t rsvd;
+ uint32_t chanid;
+ uint8_t montrig;
+ uint8_t flags1; /* VMBUS_CHOFFER_FLAG1_ */
+ uint16_t flags2;
+ uint32_t connid;
+} __rte_packed;
+
+#define VMBUS_CHOFFER_FLAG1_HASMNF 0x01
+
+#endif /* !_VMBUS_REG_H_ */
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/vmbus_bufring.c b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_bufring.c
new file mode 100644
index 000000000..c88001605
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_bufring.c
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ */
+
+#include <unistd.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/uio.h>
+
+#include <rte_eal.h>
+#include <rte_tailq.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_bus.h>
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_pause.h>
+#include <rte_bus_vmbus.h>
+
+#include "private.h"
+
+/* Advance bufring index idx by inc, wrapping once when it reaches size sz */
+static inline uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz)
+{
+	const uint32_t next = idx + inc;
+
+	return (next >= sz) ? next - sz : next;
+}
+
+/* Bind br to the ring at buf; blen is the buffer size including the header */
+void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen)
+{
+	struct vmbus_bufring *ring = buf;
+
+	br->vbr = ring;
+	/* Start the private write index from the ring's current one */
+	br->windex = ring->windex;
+	/* Data area is what remains after the bufring header */
+	br->dsize = blen - sizeof(*ring);
+}
+
+/*
+ * When we write to the ring buffer, check if the host needs to be
+ * signaled.
+ *
+ * The contract:
+ * - The host guarantees that while it is draining the TX bufring,
+ * it will set the imask to indicate it does not need to be
+ * interrupted when new data are added.
+ * - The host guarantees that it will completely drain the TX bufring
+ * before exiting the read loop. Further, once the TX bufring is
+ * empty, it will clear the imask and re-check to see if new
+ * data have arrived.
+ *
+ * old_windex is the write index the ring had before the new data was
+ * added. Returns true only when imask is clear and the ring's read
+ * index equals old_windex.
+ */
+static inline bool
+vmbus_txbr_need_signal(const struct vmbus_br *tbr, uint32_t old_windex)
+{
+ rte_smp_mb();
+ if (tbr->vbr->imask)
+ return false;
+
+ rte_smp_rmb();
+
+ /*
+ * This is the only case we need to signal when the
+ * ring transitions from being empty to non-empty.
+ */
+ return old_windex == tbr->vbr->rindex;
+}
+
+/*
+ * Copy cplen bytes from src0 into the TX ring data area starting at windex,
+ * splitting the copy in two when it runs past the end of the data area.
+ * Returns the write index advanced by cplen.
+ */
+static inline uint32_t
+vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex,
+		  const void *src0, uint32_t cplen)
+{
+	const uint8_t *from = src0;
+	uint8_t *ring_data = tbr->vbr->data;
+	const uint32_t ring_len = tbr->dsize;
+	const uint32_t tail_room = ring_len - windex;
+
+	/* XXX use double mapping like Linux kernel? */
+	if (cplen <= tail_room) {
+		memcpy(ring_data + windex, from, cplen);
+	} else {
+		/* Wrap-around: fill up to the end, then continue at offset 0 */
+		memcpy(ring_data + windex, from, tail_room);
+		memcpy(ring_data, from + tail_room, cplen - tail_room);
+	}
+
+	return vmbus_br_idxinc(windex, cplen, ring_len);
+}
+
+/*
+ * Write scattered channel packet to TX bufring.
+ *
+ * The offset of this channel packet is written as a 64bits value
+ * immediately after this channel packet.
+ *
+ * The write goes through three stages:
+ * 1. Reserve space in ring buffer for the new data.
+ * Writer atomically moves the private write index (tbr->windex).
+ * 2. Copy the new data into the ring.
+ * 3. Update the tail of the ring (visible to host) that indicates
+ * next read location. Writer updates the shared write index
+ * (vbr->windex).
+ *
+ * Returns 0 on success, or -EAGAIN when the free space in the ring is
+ * not strictly greater than the packet plus its 64bits offset.
+ * On success *need_sig is OR-ed with the result of
+ * vmbus_txbr_need_signal(), so it must point to an initialized bool.
+ */
+int
+vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen,
+ bool *need_sig)
+{
+ struct vmbus_bufring *vbr = tbr->vbr;
+ uint32_t ring_size = tbr->dsize;
+ uint32_t old_windex, next_windex, windex, total;
+ uint64_t save_windex;
+ int i;
+
+ /* Total bytes to reserve: all iov segments plus the trailing offset */
+ total = 0;
+ for (i = 0; i < iovlen; i++)
+ total += iov[i].iov_len;
+ total += sizeof(save_windex);
+
+ /* Reserve space in ring */
+ do {
+ uint32_t avail;
+
+ /* Get current free location */
+ old_windex = tbr->windex;
+
+ /* Prevent compiler reordering this with calculation */
+ rte_compiler_barrier();
+
+ avail = vmbus_br_availwrite(tbr, old_windex);
+
+ /* If not enough space in ring, then tell caller. */
+ if (avail <= total)
+ return -EAGAIN;
+
+ next_windex = vmbus_br_idxinc(old_windex, total, ring_size);
+
+ /* Atomic update of next write_index for other threads */
+ } while (!rte_atomic32_cmpset(&tbr->windex, old_windex, next_windex));
+
+ /* Space from old..new is now reserved */
+ windex = old_windex;
+ for (i = 0; i < iovlen; i++) {
+ windex = vmbus_txbr_copyto(tbr, windex,
+ iov[i].iov_base, iov[i].iov_len);
+ }
+
+ /*
+ * Set the offset of the current channel packet
+ * (stored in the upper 32 bits of the 64bits value).
+ */
+ save_windex = ((uint64_t)old_windex) << 32;
+ windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
+ sizeof(save_windex));
+
+ /* The region reserved should match region used */
+ RTE_ASSERT(windex == next_windex);
+
+ /* Ensure that data is available before updating host index */
+ rte_smp_wmb();
+
+ /* Checkin for our reservation. wait for our turn to update host */
+ while (!rte_atomic32_cmpset(&vbr->windex, old_windex, next_windex))
+ rte_pause();
+
+ /* If host had read all data before this, then need to signal */
+ *need_sig |= vmbus_txbr_need_signal(tbr, old_windex);
+ return 0;
+}
+
+/*
+ * Copy cplen bytes out of the RX ring data area starting at rindex into
+ * dst0, splitting the copy in two when it runs past the end of the data
+ * area. Returns the read index advanced by cplen.
+ */
+static inline uint32_t
+vmbus_rxbr_copyfrom(const struct vmbus_br *rbr, uint32_t rindex,
+		    void *dst0, size_t cplen)
+{
+	uint8_t *to = dst0;
+	const uint8_t *ring_data = rbr->vbr->data;
+	const uint32_t ring_len = rbr->dsize;
+	const uint32_t tail_len = ring_len - rindex;
+
+	if (cplen <= tail_len) {
+		memcpy(to, ring_data + rindex, cplen);
+	} else {
+		/* Wrap-around: take the tail first, then the rest from offset 0 */
+		memcpy(to, ring_data + rindex, tail_len);
+		memcpy(to + tail_len, ring_data, cplen - tail_len);
+	}
+
+	return vmbus_br_idxinc(rindex, cplen, ring_len);
+}
+
+/*
+ * Copy dlen bytes from the receive ring without moving the read index.
+ * Returns 0 on success, -EAGAIN if not enough data is available.
+ */
+int
+vmbus_rxbr_peek(const struct vmbus_br *rbr, void *data, size_t dlen)
+{
+	/*
+	 * Besides the requested data, the 64bits channel packet
+	 * offset must be present as well.
+	 */
+	const size_t needed = dlen + sizeof(uint64_t);
+
+	if (vmbus_br_availread(rbr) < needed)
+		return -EAGAIN;
+
+	vmbus_rxbr_copyfrom(rbr, rbr->vbr->rindex, data, dlen);
+
+	return 0;
+}
+
+/*
+ * Copy data from receive ring and change index
+ * NOTE:
+ * We assume (dlen + skip) == sizeof(channel packet).
+ *
+ * The first skip bytes at the current read index are stepped over,
+ * the next dlen bytes are copied to data, and the trailing 64bits
+ * offset is discarded before the shared read index is updated.
+ * (The prototype in private.h names the skip parameter hlen.)
+ *
+ * Returns 0 on success, or -EAGAIN if fewer than
+ * dlen + skip + sizeof(uint64_t) bytes are available to read.
+ */
+int
+vmbus_rxbr_read(struct vmbus_br *rbr, void *data, size_t dlen, size_t skip)
+{
+ struct vmbus_bufring *vbr = rbr->vbr;
+ uint32_t br_dsize = rbr->dsize;
+ uint32_t rindex;
+
+ if (vmbus_br_availread(rbr) < dlen + skip + sizeof(uint64_t))
+ return -EAGAIN;
+
+ /* Record where host was when we started read (for debug) */
+ rbr->windex = rbr->vbr->windex;
+
+ /*
+ * Copy channel packet from RX bufring.
+ */
+ rindex = vmbus_br_idxinc(rbr->vbr->rindex, skip, br_dsize);
+ rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen);
+
+ /*
+ * Discard this channel packet's 64bits offset, which is useless to us.
+ */
+ rindex = vmbus_br_idxinc(rindex, sizeof(uint64_t), br_dsize);
+
+ /* Update the read index _after_ the channel packet is fetched. */
+ rte_compiler_barrier();
+
+ vbr->rindex = rindex;
+
+ return 0;
+}
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/vmbus_channel.c b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_channel.c
new file mode 100644
index 000000000..46b3ba3f9
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_channel.c
@@ -0,0 +1,445 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/uio.h>
+
+#include <rte_eal.h>
+#include <rte_tailq.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_bus.h>
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_bus_vmbus.h>
+
+#include "private.h"
+
+/* Atomically OR the bits of mask into the 32-bit word at addr. */
+static inline void
+vmbus_sync_set_bit(volatile uint32_t *addr, uint32_t mask)
+{
+	/* GCC builtin performing an atomic OR; the returned value is not needed */
+	(void)__sync_fetch_and_or(addr, mask);
+}
+
+/* Set the bit belonging to relid in the device's interrupt page. */
+static inline void
+vmbus_send_interrupt(const struct rte_vmbus_device *dev, uint32_t relid)
+{
+	/* one bit per relid, 32 relids per word */
+	uint32_t *word = dev->int_page + relid / 32;
+	uint32_t bit = 1u << (relid % 32);
+
+	vmbus_sync_set_bit(word, bit);
+}
+
+/* Set the pending bit for monitor_id in the device's monitor page. */
+static inline void
+vmbus_set_monitor(const struct rte_vmbus_device *dev, uint32_t monitor_id)
+{
+	/* monitor ids are grouped HV_MON_TRIG_LEN to a trigger entry */
+	unsigned int group = monitor_id / HV_MON_TRIG_LEN;
+	uint32_t bit = 1u << (monitor_id % HV_MON_TRIG_LEN);
+	uint32_t *pending = &dev->monitor_page->trigs[group].pending;
+
+	vmbus_sync_set_bit(pending, bit);
+}
+
+/* Flag the channel in the interrupt page, then in the monitor page. */
+static void
+vmbus_set_event(const struct rte_vmbus_device *dev,
+		const struct vmbus_channel *chan)
+{
+	const uint32_t relid = chan->relid;
+	const uint32_t monitor = chan->monitor_id;
+
+	vmbus_send_interrupt(dev, relid);
+	vmbus_set_monitor(dev, monitor);
+}
+
+/*
+ * Set the wait between hypervisor examinations of this channel's
+ * monitor trigger.
+ *
+ * The value stored in the monitor page is latency / 100, as the host
+ * expresses it in 100 nanosecond units.  A latency that is too large, or
+ * a monitor id whose trigger group is out of range, is logged and ignored.
+ */
+void
+rte_vmbus_set_latency(const struct rte_vmbus_device *dev,
+		      const struct vmbus_channel *chan,
+		      uint32_t latency)
+{
+	const uint32_t monitor = chan->monitor_id;
+	const uint32_t group = monitor / VMBUS_MONTRIG_LEN;
+	const uint32_t slot = monitor % VMBUS_MONTRIG_LEN;
+
+	if (latency >= UINT16_MAX * 100) {
+		VMBUS_LOG(ERR, "invalid latency value %u", latency);
+		return;
+	}
+
+	if (group >= VMBUS_MONTRIGS_MAX) {
+		VMBUS_LOG(ERR, "invalid monitor trigger %u",
+			  group);
+		return;
+	}
+
+	/* Host value is expressed in 100 nanosecond units */
+	dev->monitor_page->lat[group][slot] = latency / 100;
+}
+
+/*
+ * Notify host that there is data pending on our TX bufring.
+ *
+ * Since this runs in userspace, rely on the interrupt/monitor pages
+ * (see vmbus_set_event); a hypercall can't be done from userspace.
+ *
+ * Nothing is signalled while the TX ring's imask field is non-zero.
+ */
+void
+rte_vmbus_chan_signal_tx(const struct vmbus_channel *chan)
+{
+ const struct rte_vmbus_device *dev = chan->device;
+ const struct vmbus_br *tbr = &chan->txbr;
+
+ /* Make sure all updates are done before signaling host */
+ rte_smp_wmb();
+
+ /* Host has interrupts masked on this ring: skip the signal */
+ if (tbr->vbr->imask)
+ return;
+
+ vmbus_set_event(dev, chan);
+}
+
+
+/*
+ * Simple send: write one packet (header + dlen bytes of data, padded to
+ * a multiple of 8 bytes) straight into the channel's transmit ring.
+ *
+ * If need_sig is non-NULL the caller handles signalling later and the
+ * "host needs a signal" status is OR-ed into *need_sig.  If need_sig is
+ * NULL the host is signalled here when the write succeeded and requires it.
+ *
+ * Returns the result of vmbus_txbr_write().
+ */
+int rte_vmbus_chan_send(struct vmbus_channel *chan, uint16_t type,
+			void *data, uint32_t dlen,
+			uint64_t xactid, uint32_t flags, bool *need_sig)
+{
+	struct vmbus_chanpkt pkt;
+	struct iovec iov[3];
+	const uint32_t hlen = sizeof(pkt);
+	unsigned int pktlen, pad_pktlen;
+	uint64_t pad = 0;
+	bool send_evt = false;
+	int error;
+
+	pktlen = hlen + dlen;
+	pad_pktlen = RTE_ALIGN(pktlen, sizeof(uint64_t));
+
+	/* header lengths are stored shifted by VMBUS_CHANPKT_SIZE_SHIFT */
+	pkt.hdr.type = type;
+	pkt.hdr.flags = flags;
+	pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT;
+	pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT;
+	pkt.hdr.xactid = xactid;
+
+	/* header, payload, zero padding */
+	iov[0] = (struct iovec){ .iov_base = &pkt, .iov_len = hlen };
+	iov[1] = (struct iovec){ .iov_base = data, .iov_len = dlen };
+	iov[2] = (struct iovec){ .iov_base = &pad,
+				 .iov_len = pad_pktlen - pktlen };
+
+	error = vmbus_txbr_write(&chan->txbr, iov, 3, &send_evt);
+
+	if (need_sig != NULL) {
+		/* caller will signal later: only report the status */
+		*need_sig |= send_evt;
+		return error;
+	}
+
+	if (error == 0 && send_evt)
+		rte_vmbus_chan_signal_tx(chan);
+
+	return error;
+}
+
+/*
+ * Scatter/gather send: the packet descriptor carries sglen entries of
+ * sg[], followed by dlen bytes of data, padded to a multiple of 8 bytes.
+ *
+ * If need_sig is non-NULL the caller is batching and the "host needs a
+ * signal" status is OR-ed into *need_sig; otherwise the host is signalled
+ * here when the write succeeded and requires it.
+ *
+ * Returns the result of vmbus_txbr_write().
+ */
+int rte_vmbus_chan_send_sglist(struct vmbus_channel *chan,
+			       struct vmbus_gpa sg[], uint32_t sglen,
+			       void *data, uint32_t dlen,
+			       uint64_t xactid, bool *need_sig)
+{
+	struct vmbus_chanpkt_sglist pkt;
+	struct iovec iov[4];
+	unsigned int hlen, pktlen, pad_pktlen;
+	uint64_t pad = 0;
+	bool send_evt = false;
+	int error;
+
+	/* header length covers the fixed part plus sglen gpa entries */
+	hlen = offsetof(struct vmbus_chanpkt_sglist, gpa[sglen]);
+	pktlen = hlen + dlen;
+	pad_pktlen = RTE_ALIGN(pktlen, sizeof(uint64_t));
+
+	pkt.hdr.type = VMBUS_CHANPKT_TYPE_GPA;
+	pkt.hdr.flags = VMBUS_CHANPKT_FLAG_RC;
+	pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT;
+	pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT;
+	pkt.hdr.xactid = xactid;
+	pkt.rsvd = 0;
+	pkt.gpa_cnt = sglen;
+
+	/* fixed header, gpa list, payload, zero padding */
+	iov[0] = (struct iovec){ .iov_base = &pkt, .iov_len = sizeof(pkt) };
+	iov[1] = (struct iovec){ .iov_base = sg,
+				 .iov_len = sizeof(struct vmbus_gpa) * sglen };
+	iov[2] = (struct iovec){ .iov_base = data, .iov_len = dlen };
+	iov[3] = (struct iovec){ .iov_base = &pad,
+				 .iov_len = pad_pktlen - pktlen };
+
+	error = vmbus_txbr_write(&chan->txbr, iov, 4, &send_evt);
+
+	if (need_sig != NULL) {
+		/* caller is batching: just propagate the status */
+		*need_sig |= send_evt;
+		return error;
+	}
+
+	if (error == 0 && send_evt)
+		rte_vmbus_chan_signal_tx(chan);
+
+	return error;
+}
+
+/* True when the receive ring's read index equals its write index. */
+bool rte_vmbus_chan_rx_empty(const struct vmbus_channel *channel)
+{
+	const struct vmbus_bufring *ring = channel->rxbr.vbr;
+
+	return ring->rindex == ring->windex;
+}
+
+/*
+ * Signal host after reading N bytes
+ *
+ * The host is only signalled when the ring advertises the
+ * feat_pending_send_sz feature, the host has published a non-zero
+ * pending_send size, and this read is what made that size fit.
+ */
+void rte_vmbus_chan_signal_read(struct vmbus_channel *chan, uint32_t bytes_read)
+{
+ struct vmbus_br *rbr = &chan->rxbr;
+ uint32_t write_sz, pending_sz;
+
+ /* No need for signaling on older versions */
+ if (!rbr->vbr->feature_bits.feat_pending_send_sz)
+ return;
+
+ /* Make sure reading of pending happens after new read index */
+ rte_mb();
+
+ /* Zero means the host is not waiting for space */
+ pending_sz = rbr->vbr->pending_send;
+ if (!pending_sz)
+ return;
+
+ /* Read the write index only after pending_send has been read */
+ rte_smp_rmb();
+ write_sz = vmbus_br_availwrite(rbr, rbr->vbr->windex);
+
+ /* If there was space before then host was not blocked */
+ if (write_sz - bytes_read > pending_sz)
+ return;
+
+ /* If pending write will not fit */
+ if (write_sz <= pending_sz)
+ return;
+
+ vmbus_set_event(chan->device, chan);
+}
+
+/*
+ * Receive the payload of the next packet on the channel.
+ *
+ * On entry *len is the capacity of data.  On return *len is the payload
+ * length of the packet, or 0 when the header could not be read or was
+ * rejected.  The packet header itself is skipped, not copied.  When
+ * request_id is non-NULL it receives the packet's xactid.
+ *
+ * Returns 0 on success, -ENOBUFS when data is too small (nothing is
+ * consumed from the ring), -EIO on an invalid header, or the error
+ * returned by the ring peek/read helpers.
+ */
+int rte_vmbus_chan_recv(struct vmbus_channel *chan, void *data, uint32_t *len,
+			uint64_t *request_id)
+{
+	const uint32_t capacity = *len;
+	struct vmbus_chanpkt_hdr pkt;
+	uint32_t hdr_bytes, payload_bytes;
+	int rc;
+
+	*len = 0;
+
+	rc = vmbus_rxbr_peek(&chan->rxbr, &pkt, sizeof(pkt));
+	if (rc != 0)
+		return rc;
+
+	if (unlikely(pkt.hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+		VMBUS_LOG(ERR, "VMBUS recv, invalid hlen %u", pkt.hlen);
+		/* XXX this channel is dead actually. */
+		return -EIO;
+	}
+
+	if (unlikely(pkt.hlen > pkt.tlen)) {
+		VMBUS_LOG(ERR, "VMBUS recv,invalid hlen %u and tlen %u",
+			  pkt.hlen, pkt.tlen);
+		return -EIO;
+	}
+
+	/* Header fields count quad words; convert to bytes */
+	hdr_bytes = pkt.hlen << VMBUS_CHANPKT_SIZE_SHIFT;
+	payload_bytes = (pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT) - hdr_bytes;
+	*len = payload_bytes;
+
+	/* Caller buffer too small: report the needed size through *len */
+	if (unlikely(payload_bytes > capacity))
+		return -ENOBUFS;
+
+	if (request_id != NULL)
+		*request_id = pkt.xactid;
+
+	/* Copy the payload, stepping over the packet header */
+	rc = vmbus_rxbr_read(&chan->rxbr, data, payload_bytes, hdr_bytes);
+	if (rc != 0)
+		return rc;
+
+	rte_vmbus_chan_signal_read(chan,
+				   payload_bytes + hdr_bytes + sizeof(uint64_t));
+	return 0;
+}
+
+/*
+ * Receive the next packet from the channel in raw form, i.e. including
+ * its packet header (nothing is skipped).
+ *
+ * On entry *len is the capacity of data; once the header has been
+ * validated *len is set to the total packet length.
+ *
+ * Returns the number of ring bytes consumed (packet length plus the
+ * trailing 64-bit offset) on success, -ENOBUFS when data is too small,
+ * -EIO on an invalid header, or the error returned by the ring
+ * peek/read helpers.
+ *
+ * TODO: replace this with inplace ring buffer (no copy)
+ */
+int rte_vmbus_chan_recv_raw(struct vmbus_channel *chan,
+			    void *data, uint32_t *len)
+{
+	const uint32_t capacity = *len;
+	struct vmbus_chanpkt_hdr pkt;
+	uint32_t total_bytes;
+	int rc;
+
+	rc = vmbus_rxbr_peek(&chan->rxbr, &pkt, sizeof(pkt));
+	if (rc != 0)
+		return rc;
+
+	if (unlikely(pkt.hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+		VMBUS_LOG(ERR, "VMBUS recv, invalid hlen %u", pkt.hlen);
+		/* XXX this channel is dead actually. */
+		return -EIO;
+	}
+
+	if (unlikely(pkt.hlen > pkt.tlen)) {
+		VMBUS_LOG(ERR, "VMBUS recv,invalid hlen %u and tlen %u",
+			  pkt.hlen, pkt.tlen);
+		return -EIO;
+	}
+
+	/* tlen counts quad words; convert to bytes */
+	total_bytes = pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT;
+	*len = total_bytes;
+
+	/* Caller buffer too small: report the needed size through *len */
+	if (unlikely(total_bytes > capacity))
+		return -ENOBUFS;
+
+	/* Copy the whole packet, header included (skip is 0) */
+	rc = vmbus_rxbr_read(&chan->rxbr, data, total_bytes, 0);
+	if (rc != 0)
+		return rc;
+
+	/* Return the number of bytes read */
+	return total_bytes + sizeof(uint64_t);
+}
+
+/*
+ * Allocate a channel object for device and map its ring buffers.
+ *
+ * The channel is zero-allocated on the device's NUMA node and filled in
+ * with relid, subid and monitor_id before vmbus_uio_map_rings() is called.
+ *
+ * Returns 0 and stores the channel in *new_chan on success.  On failure
+ * returns -ENOMEM or the error from vmbus_uio_map_rings(); *new_chan is
+ * then left untouched, so the caller never receives a pointer to the
+ * channel that was freed here.
+ */
+int vmbus_chan_create(const struct rte_vmbus_device *device,
+		      uint16_t relid, uint16_t subid, uint8_t monitor_id,
+		      struct vmbus_channel **new_chan)
+{
+	struct vmbus_channel *chan;
+	int err;
+
+	chan = rte_zmalloc_socket("VMBUS", sizeof(*chan), RTE_CACHE_LINE_SIZE,
+				  device->device.numa_node);
+	if (!chan)
+		return -ENOMEM;
+
+	STAILQ_INIT(&chan->subchannel_list);
+	chan->device = device;
+	chan->subchannel_id = subid;
+	chan->relid = relid;
+	chan->monitor_id = monitor_id;
+
+	err = vmbus_uio_map_rings(chan);
+	if (err) {
+		rte_free(chan);
+		return err;
+	}
+
+	/* Publish only once fully set up: no dangling pointer on failure */
+	*new_chan = chan;
+	return 0;
+}
+
+/*
+ * Set up the primary channel of device.
+ *
+ * Returns -EINVAL when no uio resource is recorded for the device,
+ * otherwise the result of vmbus_chan_create().  On success the new
+ * channel is recorded as primary in both the device and its uio resource.
+ */
+int rte_vmbus_chan_open(struct rte_vmbus_device *device,
+			struct vmbus_channel **new_chan)
+{
+	struct mapped_vmbus_resource *uio_res = vmbus_uio_find_resource(device);
+	int err;
+
+	if (uio_res == NULL) {
+		VMBUS_LOG(ERR, "can't find uio resource");
+		return -EINVAL;
+	}
+
+	err = vmbus_chan_create(device, device->relid, 0,
+				device->monitor_id, new_chan);
+	if (err != 0)
+		return err;
+
+	device->primary = *new_chan;
+	uio_res->primary = *new_chan;
+	return 0;
+}
+
+/*
+ * Number of channels usable on device: VMBUS_MAX_CHANNELS when
+ * vmbus_uio_subchannels_supported() reports support, otherwise 1.
+ */
+int rte_vmbus_max_channels(const struct rte_vmbus_device *device)
+{
+	return vmbus_uio_subchannels_supported(device, device->primary)
+		? VMBUS_MAX_CHANNELS : 1;
+}
+
+/*
+ * Set up a secondary (sub) channel of primary.
+ *
+ * The channel obtained from vmbus_uio_get_subchan() is appended to the
+ * primary's subchannel list and returned through *new_chan.
+ * Returns 0 on success or the error from vmbus_uio_get_subchan().
+ */
+int rte_vmbus_subchan_open(struct vmbus_channel *primary,
+			   struct vmbus_channel **new_chan)
+{
+	struct vmbus_channel *sub;
+	int err = vmbus_uio_get_subchan(primary, &sub);
+
+	if (err != 0)
+		return err;
+
+	STAILQ_INSERT_TAIL(&primary->subchannel_list, sub, next);
+	*new_chan = sub;
+	return 0;
+}
+
+/* Return the subchannel id stored in chan. */
+uint16_t rte_vmbus_sub_channel_index(const struct vmbus_channel *chan)
+{
+	const uint16_t index = chan->subchannel_id;
+
+	return index;
+}
+
+/*
+ * Close a channel.
+ *
+ * A subchannel is unlinked from its primary's subchannel list and freed.
+ * The primary channel is intentionally leaked because secondary
+ * channels may still reference it.
+ */
+void rte_vmbus_chan_close(struct vmbus_channel *chan)
+{
+	struct vmbus_channel *primary = chan->device->primary;
+
+	if (chan == primary)
+		return;
+
+	STAILQ_REMOVE(&primary->subchannel_list, chan,
+		      vmbus_channel, next);
+	rte_free(chan);
+}
+
+/*
+ * Print the state of one ring buffer to f, labelled with id: the shared
+ * indexes and flags, the ring size with available write/read space, and,
+ * when a packet header can be peeked, a summary of that header.
+ */
+static void vmbus_dump_ring(FILE *f, const char *id, const struct vmbus_br *br)
+{
+	const struct vmbus_bufring *vbr = br->vbr;
+	struct vmbus_chanpkt_hdr pkt;
+	uint32_t wr_avail, rd_avail;
+
+	fprintf(f, "%s windex=%u rindex=%u mask=%u pending=%u feature=%#x\n",
+		id, vbr->windex, vbr->rindex, vbr->imask,
+		vbr->pending_send, vbr->feature_bits.value);
+
+	wr_avail = vmbus_br_availwrite(br, vbr->windex);
+	rd_avail = vmbus_br_availread(br);
+	fprintf(f, " size=%u avail write=%u read=%u\n",
+		br->dsize, wr_avail, rd_avail);
+
+	/* no complete header available: nothing more to show */
+	if (vmbus_rxbr_peek(br, &pkt, sizeof(pkt)) != 0)
+		return;
+
+	fprintf(f, " pkt type %#x len %u flags %#x xactid %#"PRIx64"\n",
+		pkt.type,
+		pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT,
+		pkt.flags, pkt.xactid);
+}
+
+/* Print the channel's identifiers, then its receive and transmit rings, to f. */
+void rte_vmbus_chan_dump(FILE *f, const struct vmbus_channel *chan)
+{
+	const unsigned int index = chan->subchannel_id;
+	const unsigned int relid = chan->relid;
+	const unsigned int monitor = chan->monitor_id;
+
+	fprintf(f, "channel[%u] relid=%u monitor=%u\n",
+		index, relid, monitor);
+
+	vmbus_dump_ring(f, "rxbr", &chan->rxbr);
+	vmbus_dump_ring(f, "txbr", &chan->txbr);
+}
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/vmbus_common.c b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_common.c
new file mode 100644
index 000000000..48a219f73
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_common.c
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/queue.h>
+#include <sys/mman.h>
+
+#include <rte_log.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_tailq.h>
+#include <rte_devargs.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_bus_vmbus.h>
+
+#include "private.h"
+
+/* Log type id of this bus; assigned by vmbus_init_log() */
+int vmbus_logtype_bus;
+/* The VMBUS bus instance; its definition appears later in this file */
+extern struct rte_vmbus_bus rte_vmbus_bus;
+
+/*
+ * Map size bytes of fd at offset as a shared read/write mapping,
+ * asking for requested_addr and OR-ing flags into the mmap flags.
+ *
+ * Returns the mapped address, or MAP_FAILED (after logging) on error.
+ */
+void *
+vmbus_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
+		   int flags)
+{
+	void *mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
+			     MAP_SHARED | flags, fd, offset);
+
+	if (mapaddr != MAP_FAILED)
+		return mapaddr;
+
+	VMBUS_LOG(ERR,
+		  "mmap(%d, %p, %zu, %ld) failed: %s",
+		  fd, requested_addr, size, (long)offset,
+		  strerror(errno));
+	return MAP_FAILED;
+}
+
+/*
+ * Unmap size bytes at requested_addr.  A NULL address is ignored;
+ * the outcome of munmap() is only logged.
+ */
+void
+vmbus_unmap_resource(void *requested_addr, size_t size)
+{
+	if (requested_addr == NULL)
+		return;
+
+	if (munmap(requested_addr, size) == 0) {
+		VMBUS_LOG(DEBUG, " VMBUS memory unmapped at %p",
+			  requested_addr);
+		return;
+	}
+
+	VMBUS_LOG(ERR, "munmap(%p, 0x%lx) failed: %s",
+		  requested_addr, (unsigned long)size,
+		  strerror(errno));
+}
+
+/**
+ * Match a VMBUS driver against a device using the driver's UUID table
+ *
+ * The table is walked until its null-UUID terminator.
+ *
+ * @param dr
+ *	VMBUS driver whose id_table is searched
+ * @param dev
+ *	VMBUS device whose class_id is looked up
+ * @return
+ *	true if an entry of the table equals the device class id
+ *	false otherwise
+ */
+static bool
+vmbus_match(const struct rte_vmbus_driver *dr,
+	    const struct rte_vmbus_device *dev)
+{
+	const rte_uuid_t *entry = dr->id_table;
+
+	while (!rte_uuid_is_null(*entry)) {
+		if (rte_uuid_compare(*entry, dev->class_id) == 0)
+			return true;
+		++entry;
+	}
+
+	return false;
+}
+/*
+ * If the driver's id table matches the device, map the device and call
+ * the driver's probe() function.
+ *
+ * Returns 1 when the driver does not support the device, the error from
+ * rte_vmbus_map_device() when mapping fails, otherwise the result of
+ * probe().  When probe() fails the device is unmapped again.
+ */
+static int
+vmbus_probe_one_driver(struct rte_vmbus_driver *dr,
+		       struct rte_vmbus_device *dev)
+{
+	char guid[RTE_UUID_STRLEN];
+	int ret;
+
+	if (!vmbus_match(dr, dev))
+		return 1; /* not supported */
+
+	rte_uuid_unparse(dev->device_id, guid, sizeof(guid));
+	VMBUS_LOG(INFO, "VMBUS device %s on NUMA socket %i",
+		  guid, dev->device.numa_node);
+
+	/* TODO add blacklisted */
+
+	/* map resources for device */
+	ret = rte_vmbus_map_device(dev);
+	if (ret != 0)
+		return ret;
+
+	/* reference driver structure */
+	dev->driver = dr;
+
+	if (dev->device.numa_node < 0) {
+		VMBUS_LOG(WARNING, " Invalid NUMA socket, default to 0");
+		dev->device.numa_node = 0;
+	}
+
+	/* call the driver probe() function */
+	VMBUS_LOG(INFO, " probe driver: %s", dr->driver.name);
+	ret = dr->probe(dr, dev);
+	if (ret == 0) {
+		dev->device.driver = &dr->driver;
+		return 0;
+	}
+
+	/* probe failed: drop the driver reference and release the mappings */
+	dev->driver = NULL;
+	rte_vmbus_unmap_device(dev);
+	return ret;
+}
+
+/*
+ * If the device class GUID matches, call the probe function of the
+ * registered drivers for the vmbus device.
+ * Return 0 if the device is already probed or a driver accepted it,
+ * -1 if initialization failed,
+ * and 1 if no driver was found for this device.
+ */
+static int
+vmbus_probe_all_drivers(struct rte_vmbus_device *dev)
+{
+	struct rte_vmbus_driver *dr;
+
+	/* Check if a driver is already loaded */
+	if (rte_dev_is_probed(&dev->device)) {
+		VMBUS_LOG(DEBUG, "VMBUS driver already loaded");
+		return 0;
+	}
+
+	FOREACH_DRIVER_ON_VMBUS(dr) {
+		int rc = vmbus_probe_one_driver(dr, dev);
+
+		if (rc == 0)	/* driver accepted the device */
+			return 0;
+
+		if (rc < 0)	/* negative is an error */
+			return -1;
+
+		/* positive: this driver doesn't support it, try the next */
+	}
+	return 1;
+}
+
+/*
+ * Walk every device found on the vmbus and try the registered drivers
+ * on it.
+ *
+ * Returns -1 only when at least one device was seen and every one of
+ * them failed to probe; otherwise 0.
+ */
+int
+rte_vmbus_probe(void)
+{
+	struct rte_vmbus_device *dev;
+	char ubuf[RTE_UUID_STRLEN];
+	size_t probed = 0;
+	size_t failed = 0;
+
+	FOREACH_DEVICE_ON_VMBUS(dev) {
+		probed++;
+
+		rte_uuid_unparse(dev->device_id, ubuf, sizeof(ubuf));
+
+		/* TODO: add whitelist/blacklist */
+
+		if (vmbus_probe_all_drivers(dev) >= 0)
+			continue;
+
+		VMBUS_LOG(NOTICE,
+			  "Requested device %s cannot be used", ubuf);
+		rte_errno = errno;
+		failed++;
+	}
+
+	if (probed != 0 && failed == probed)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Parse name as a UUID.  When parsing succeeds and addr is non-NULL the
+ * parsed UUID is copied to addr.  Returns the result of rte_uuid_parse().
+ */
+static int
+vmbus_parse(const char *name, void *addr)
+{
+	rte_uuid_t guid;
+	int ret = rte_uuid_parse(name, guid);
+
+	if (ret != 0 || addr == NULL)
+		return ret;
+
+	memcpy(addr, &guid, sizeof(guid));
+	return ret;
+}
+
+/*
+ * Scan for matching device args on the command line
+ * example:
+ *	-w 'vmbus:635a7ae3-091e-4410-ad59-667c4f8c04c3,latency=20'
+ *
+ * Returns the first "vmbus" devargs entry whose name parses to the
+ * device's id, or NULL when there is none.  Entries whose name is not a
+ * valid UUID are skipped.
+ */
+struct rte_devargs *
+vmbus_devargs_lookup(struct rte_vmbus_device *dev)
+{
+	struct rte_devargs *devargs;
+	rte_uuid_t addr;
+
+	RTE_EAL_DEVARGS_FOREACH("vmbus", devargs) {
+		/*
+		 * vmbus_parse() fills addr only on success; comparing
+		 * after a failed parse would read uninitialized memory.
+		 */
+		if (vmbus_parse(devargs->name, &addr) != 0)
+			continue;
+
+		if (rte_uuid_compare(dev->device_id, addr) == 0)
+			return devargs;
+	}
+
+	return NULL;
+}
+
+/* Register a vmbus driver: bind it to the bus and append it to the driver list. */
+void
+rte_vmbus_register(struct rte_vmbus_driver *driver)
+{
+	VMBUS_LOG(DEBUG,
+		  "Registered driver %s", driver->driver.name);
+
+	driver->bus = &rte_vmbus_bus;
+	TAILQ_INSERT_TAIL(&rte_vmbus_bus.driver_list, driver, next);
+}
+
+/* Unregister a vmbus driver: detach it from the bus and drop it from the driver list. */
+void
+rte_vmbus_unregister(struct rte_vmbus_driver *driver)
+{
+	driver->bus = NULL;
+	TAILQ_REMOVE(&rte_vmbus_bus.driver_list, driver, next);
+}
+
+/* Add a device to VMBUS bus: appended at the tail of the bus device list */
+void
+vmbus_add_device(struct rte_vmbus_device *vmbus_dev)
+{
+ TAILQ_INSERT_TAIL(&rte_vmbus_bus.device_list, vmbus_dev, next);
+}
+
+/*
+ * Insert a device into a predefined position in VMBUS bus:
+ * new_vmbus_dev is linked immediately before exist_vmbus_dev.
+ */
+void
+vmbus_insert_device(struct rte_vmbus_device *exist_vmbus_dev,
+ struct rte_vmbus_device *new_vmbus_dev)
+{
+ TAILQ_INSERT_BEFORE(exist_vmbus_dev, new_vmbus_dev, next);
+}
+
+/* Remove a device from VMBUS bus: unlinks it from the bus device list */
+void
+vmbus_remove_device(struct rte_vmbus_device *vmbus_dev)
+{
+ TAILQ_REMOVE(&rte_vmbus_bus.device_list, vmbus_dev, next);
+}
+
+/*
+ * Bus find_device callback: return the first device for which
+ * cmp(device, data) == 0, or NULL when there is none.
+ *
+ * When start is non-NULL the search resumes with the device that follows
+ * start in the bus list; start itself and every device before it are
+ * skipped.  Comparing the devices ahead of start would hand back an
+ * already visited device and make a resumed iteration loop forever.
+ *
+ * VMBUS doesn't support hotplug
+ */
+static struct rte_device *
+vmbus_find_device(const struct rte_device *start, rte_dev_cmp_t cmp,
+		  const void *data)
+{
+	struct rte_vmbus_device *dev;
+
+	FOREACH_DEVICE_ON_VMBUS(dev) {
+		if (start != NULL) {
+			/* still before the resume point: skip, do not compare */
+			if (&dev->device == start)
+				start = NULL;
+			continue;
+		}
+
+		if (cmp(&dev->device, data) == 0)
+			return &dev->device;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * The VMBUS bus instance: the generic bus callbacks (scan, probe,
+ * find_device, parse) plus the device and driver lists, both
+ * statically initialized as empty tail queues.
+ */
+struct rte_vmbus_bus rte_vmbus_bus = {
+ .bus = {
+ .scan = rte_vmbus_scan,
+ .probe = rte_vmbus_probe,
+ .find_device = vmbus_find_device,
+ .parse = vmbus_parse,
+ },
+ .device_list = TAILQ_HEAD_INITIALIZER(rte_vmbus_bus.device_list),
+ .driver_list = TAILQ_HEAD_INITIALIZER(rte_vmbus_bus.driver_list),
+};
+
+/* Register the generic bus part of rte_vmbus_bus as bus "vmbus" */
+RTE_REGISTER_BUS(vmbus, rte_vmbus_bus.bus);
+
+/*
+ * Init hook: register the "bus.vmbus" log type and, when registration
+ * succeeded, set its level to NOTICE.
+ */
+RTE_INIT(vmbus_init_log)
+{
+	vmbus_logtype_bus = rte_log_register("bus.vmbus");
+	if (vmbus_logtype_bus < 0)
+		return;
+
+	rte_log_set_level(vmbus_logtype_bus, RTE_LOG_NOTICE);
+}
diff --git a/src/seastar/dpdk/drivers/bus/vmbus/vmbus_common_uio.c b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_common_uio.c
new file mode 100644
index 000000000..8e476f2ea
--- /dev/null
+++ b/src/seastar/dpdk/drivers/bus/vmbus/vmbus_common_uio.c
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018, Microsoft Corporation.
+ * All Rights Reserved.
+ */
+
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include <rte_eal.h>
+#include <rte_tailq.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_bus.h>
+#include <rte_bus_vmbus.h>
+
+#include "private.h"
+
+/*
+ * Tail queue element named "VMBUS_RESOURCE_LIST", registered with EAL.
+ * Its head is cast to the list of mapped_vmbus_resource entries by the
+ * functions in this file.
+ */
+static struct rte_tailq_elem vmbus_tailq = {
+ .name = "VMBUS_RESOURCE_LIST",
+};
+EAL_REGISTER_TAILQ(vmbus_tailq)
+
+/*
+ * Look up the mapped resource entry whose id equals the device id of dev.
+ * Returns NULL when dev is NULL or no entry matches.
+ */
+struct mapped_vmbus_resource *
+vmbus_uio_find_resource(const struct rte_vmbus_device *dev)
+{
+	struct mapped_vmbus_res_list *res_list =
+		RTE_TAILQ_CAST(vmbus_tailq.head, mapped_vmbus_res_list);
+	struct mapped_vmbus_resource *entry;
+
+	if (dev == NULL)
+		return NULL;
+
+	TAILQ_FOREACH(entry, res_list, next) {
+		if (rte_uuid_compare(entry->id, dev->device_id) != 0)
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Secondary process: map every recorded resource of the device at the
+ * address stored in its resource entry, then map the subchannels of the
+ * recorded primary channel.
+ *
+ * Returns 0 on success, -1 on any failure (missing resource entry, open
+ * or mmap failure, address mismatch, missing primary channel, subchannel
+ * mapping failure).
+ */
+static int
+vmbus_uio_map_secondary(struct rte_vmbus_device *dev)
+{
+	struct mapped_vmbus_resource *uio_res = vmbus_uio_find_resource(dev);
+	struct vmbus_channel *chan;
+	int fd, i;
+
+	if (uio_res == NULL) {
+		VMBUS_LOG(ERR, "Cannot find resource for device");
+		return -1;
+	}
+
+	/* open /dev/uioX */
+	fd = open(uio_res->path, O_RDWR);
+	if (fd < 0) {
+		VMBUS_LOG(ERR, "Cannot open %s: %s",
+			  uio_res->path, strerror(errno));
+		return -1;
+	}
+
+	for (i = 0; i != uio_res->nb_maps; i++) {
+		/* map i is taken from file offset i * PAGE_SIZE */
+		off_t offset = i * PAGE_SIZE;
+		void *mapaddr = vmbus_map_resource(uio_res->maps[i].addr,
+						   fd, offset,
+						   uio_res->maps[i].size, 0);
+
+		if (mapaddr == uio_res->maps[i].addr)
+			continue; /* successful map */
+
+		if (mapaddr != MAP_FAILED) {
+			/* mapped, but not where the entry says: undo it */
+			VMBUS_LOG(ERR,
+				  "mmap resource %d address mismatch", i);
+			vmbus_unmap_resource(mapaddr, uio_res->maps[i].size);
+		} else {
+			VMBUS_LOG(ERR,
+				  "mmap resource %d in secondary failed", i);
+		}
+
+		close(fd);
+		return -1;
+	}
+
+	/* fd is not needed in slave process, close it */
+	close(fd);
+
+	dev->primary = uio_res->primary;
+	if (dev->primary == NULL) {
+		VMBUS_LOG(ERR, "missing primary channel");
+		return -1;
+	}
+
+	STAILQ_FOREACH(chan, &dev->primary->subchannel_list, next) {
+		if (vmbus_uio_map_secondary_subchan(dev, chan) == 0)
+			continue;
+
+		VMBUS_LOG(ERR, "cannot map secondary subchan");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Primary process: allocate the uio resource entry, map the device
+ * resources up to the first empty one and append the entry to the
+ * resource list.
+ *
+ * Returns 0 on success, the error from vmbus_uio_alloc_resource() when
+ * allocation fails, or -1 when a mapping fails (resources mapped so far
+ * are unmapped and the entry is freed).
+ */
+static int
+vmbus_uio_map_primary(struct rte_vmbus_device *dev)
+{
+	struct mapped_vmbus_res_list *res_list =
+		RTE_TAILQ_CAST(vmbus_tailq.head, mapped_vmbus_res_list);
+	struct mapped_vmbus_resource *uio_res = NULL;
+	int i, ret;
+
+	/* allocate uio resource */
+	ret = vmbus_uio_alloc_resource(dev, &uio_res);
+	if (ret)
+		return ret;
+
+	/* Map the resources, stopping at the first empty BAR */
+	for (i = 0; i < VMBUS_MAX_RESOURCE && dev->resource[i].len != 0; i++) {
+		if (vmbus_uio_map_resource_by_index(dev, i, uio_res, 0) == 0)
+			continue;
+
+		/* unwind the mappings made so far, newest first */
+		while (--i >= 0) {
+			vmbus_unmap_resource(uio_res->maps[i].addr,
+					     (size_t)uio_res->maps[i].size);
+		}
+		vmbus_uio_free_resource(dev, uio_res);
+		return -1;
+	}
+
+	uio_res->nb_maps = i;
+
+	TAILQ_INSERT_TAIL(res_list, uio_res, next);
+
+	return 0;
+}
+
+/*
+ * Map the VMBUS resources of a device in virtual memory.
+ *
+ * Resets the interrupt handle, maps through the primary or secondary
+ * path depending on the EAL process type, then derives int_page (half a
+ * page into map HV_INT_PAGE_MAP) and monitor_page (map HV_MON_PAGE_MAP).
+ *
+ * Returns 0 on success, the mapping error, -EIO when the resource entry
+ * cannot be found afterwards, or -EINVAL when too few maps exist.
+ */
+int
+vmbus_uio_map_resource(struct rte_vmbus_device *dev)
+{
+	struct mapped_vmbus_resource *uio_res;
+	char *int_base;
+	int ret;
+
+	/* TODO: handle rescind */
+	dev->intr_handle.fd = -1;
+	dev->intr_handle.uio_cfg_fd = -1;
+	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+
+	/* secondary processes - use already recorded details */
+	ret = (rte_eal_process_type() != RTE_PROC_PRIMARY)
+		? vmbus_uio_map_secondary(dev)
+		: vmbus_uio_map_primary(dev);
+	if (ret != 0)
+		return ret;
+
+	uio_res = vmbus_uio_find_resource(dev);
+	if (uio_res == NULL) {
+		VMBUS_LOG(ERR, "can not find resources!");
+		return -EIO;
+	}
+
+	if (uio_res->nb_maps <= HV_MON_PAGE_MAP) {
+		VMBUS_LOG(ERR, "VMBUS: only %u resources found!",
+			  uio_res->nb_maps);
+		return -EINVAL;
+	}
+
+	int_base = (char *)uio_res->maps[HV_INT_PAGE_MAP].addr;
+	dev->int_page = (uint32_t *)(int_base + (PAGE_SIZE >> 1));
+	dev->monitor_page = uio_res->maps[HV_MON_PAGE_MAP].addr;
+	return 0;
+}
+
+/* Unmap every recorded map of uio_res; a NULL entry is ignored. */
+static void
+vmbus_uio_unmap(struct mapped_vmbus_resource *uio_res)
+{
+	int idx = 0;
+
+	if (uio_res == NULL)
+		return;
+
+	while (idx != uio_res->nb_maps) {
+		vmbus_unmap_resource(uio_res->maps[idx].addr,
+				     (size_t)uio_res->maps[idx].size);
+		idx++;
+	}
+}
+
+/*
+ * Unmap the VMBUS resources of a device in virtual memory.
+ *
+ * Does nothing when dev is NULL or no resource entry exists for it.
+ * A secondary process only unmaps.  The primary process also removes
+ * the entry from the resource list, frees it, closes the interrupt
+ * handle descriptors that are open and resets the handle.
+ */
+void
+vmbus_uio_unmap_resource(struct rte_vmbus_device *dev)
+{
+	struct mapped_vmbus_resource *uio_res;
+	struct mapped_vmbus_res_list *uio_res_list =
+		RTE_TAILQ_CAST(vmbus_tailq.head, mapped_vmbus_res_list);
+
+	if (dev == NULL)
+		return;
+
+	/* find an entry for the device */
+	uio_res = vmbus_uio_find_resource(dev);
+	if (uio_res == NULL)
+		return;
+
+	/* secondary processes - just free maps */
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+		/* a void function must not "return <expression>" in ISO C */
+		vmbus_uio_unmap(uio_res);
+		return;
+	}
+
+	TAILQ_REMOVE(uio_res_list, uio_res, next);
+
+	/* unmap all resources */
+	vmbus_uio_unmap(uio_res);
+
+	/* free uio resource */
+	rte_free(uio_res);
+
+	/* close fds if in primary process; skip descriptors never opened */
+	if (dev->intr_handle.fd >= 0)
+		close(dev->intr_handle.fd);
+	if (dev->intr_handle.uio_cfg_fd >= 0) {
+		close(dev->intr_handle.uio_cfg_fd);
+		dev->intr_handle.uio_cfg_fd = -1;
+	}
+
+	dev->intr_handle.fd = -1;
+	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+}