summaryrefslogtreecommitdiffstats
path: root/src/seastar/dpdk/lib/librte_vhost
diff options
context:
space:
mode:
Diffstat (limited to 'src/seastar/dpdk/lib/librte_vhost')
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/Makefile38
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/fd_man.c374
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/fd_man.h58
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/iotlb.c363
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/iotlb.h79
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/meson.build19
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/rte_vdpa.h196
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/rte_vhost.h728
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/rte_vhost_crypto.h112
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/rte_vhost_version.map90
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/socket.c1123
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/vdpa.c228
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/vhost.c814
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/vhost.h782
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/vhost_crypto.c1677
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/vhost_user.c2269
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/vhost_user.h157
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/virtio_crypto.h422
-rw-r--r--src/seastar/dpdk/lib/librte_vhost/virtio_net.c1638
19 files changed, 11167 insertions, 0 deletions
diff --git a/src/seastar/dpdk/lib/librte_vhost/Makefile b/src/seastar/dpdk/lib/librte_vhost/Makefile
new file mode 100644
index 000000000..5dd318987
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/Makefile
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2010-2014 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_vhost.a
+
+EXPORT_MAP := rte_vhost_version.map
+
+LIBABIVER := 4
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
+CFLAGS += -I vhost_user
+CFLAGS += -fno-strict-aliasing
+LDLIBS += -lpthread
+
+ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y)
+LDLIBS += -lnuma
+endif
+LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
+ vhost_user.c virtio_net.c vdpa.c
+
+# install includes
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
+
+# only compile vhost crypto when cryptodev is enabled
+ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
+LDLIBS += -lrte_cryptodev -lrte_hash
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_crypto.c
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost_crypto.h
+endif
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/src/seastar/dpdk/lib/librte_vhost/fd_man.c b/src/seastar/dpdk/lib/librte_vhost/fd_man.c
new file mode 100644
index 000000000..55d4856f9
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/fd_man.c
@@ -0,0 +1,374 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "fd_man.h"
+
+
+#define RTE_LOGTYPE_VHOST_FDMAN RTE_LOGTYPE_USER1
+
+#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL)
+
+static int
+get_last_valid_idx(struct fdset *pfdset, int last_valid_idx)
+{
+ int i;
+
+ for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--)
+ ;
+
+ return i;
+}
+
+static void
+fdset_move(struct fdset *pfdset, int dst, int src)
+{
+ pfdset->fd[dst] = pfdset->fd[src];
+ pfdset->rwfds[dst] = pfdset->rwfds[src];
+}
+
+static void
+fdset_shrink_nolock(struct fdset *pfdset)
+{
+ int i;
+ int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1);
+
+ for (i = 0; i < last_valid_idx; i++) {
+ if (pfdset->fd[i].fd != -1)
+ continue;
+
+ fdset_move(pfdset, i, last_valid_idx);
+ last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1);
+ }
+ pfdset->num = last_valid_idx + 1;
+}
+
+/*
+ * Find deleted fd entries and remove them
+ */
+static void
+fdset_shrink(struct fdset *pfdset)
+{
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ fdset_shrink_nolock(pfdset);
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+}
+
+/**
+ * Returns the index in the fdset for a given fd.
+ * @return
+ * index for the fd, or -1 if fd isn't in the fdset.
+ */
+static int
+fdset_find_fd(struct fdset *pfdset, int fd)
+{
+ int i;
+
+ for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++)
+ ;
+
+ return i == pfdset->num ? -1 : i;
+}
+
+static void
+fdset_add_fd(struct fdset *pfdset, int idx, int fd,
+ fd_cb rcb, fd_cb wcb, void *dat)
+{
+ struct fdentry *pfdentry = &pfdset->fd[idx];
+ struct pollfd *pfd = &pfdset->rwfds[idx];
+
+ pfdentry->fd = fd;
+ pfdentry->rcb = rcb;
+ pfdentry->wcb = wcb;
+ pfdentry->dat = dat;
+
+ pfd->fd = fd;
+ pfd->events = rcb ? POLLIN : 0;
+ pfd->events |= wcb ? POLLOUT : 0;
+ pfd->revents = 0;
+}
+
+void
+fdset_init(struct fdset *pfdset)
+{
+ int i;
+
+ if (pfdset == NULL)
+ return;
+
+ for (i = 0; i < MAX_FDS; i++) {
+ pfdset->fd[i].fd = -1;
+ pfdset->fd[i].dat = NULL;
+ }
+ pfdset->num = 0;
+}
+
+/**
+ * Register the fd in the fdset with read/write handler and context.
+ */
+int
+fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat)
+{
+ int i;
+
+ if (pfdset == NULL || fd == -1)
+ return -1;
+
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+ if (i == -1) {
+ pthread_mutex_lock(&pfdset->fd_pooling_mutex);
+ fdset_shrink_nolock(pfdset);
+ pthread_mutex_unlock(&pfdset->fd_pooling_mutex);
+ i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+ if (i == -1) {
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ return -2;
+ }
+ }
+
+ fdset_add_fd(pfdset, i, fd, rcb, wcb, dat);
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ return 0;
+}
+
+/**
+ * Unregister the fd from the fdset.
+ * Returns context of a given fd or NULL.
+ */
+void *
+fdset_del(struct fdset *pfdset, int fd)
+{
+ int i;
+ void *dat = NULL;
+
+ if (pfdset == NULL || fd == -1)
+ return NULL;
+
+ do {
+ pthread_mutex_lock(&pfdset->fd_mutex);
+
+ i = fdset_find_fd(pfdset, fd);
+ if (i != -1 && pfdset->fd[i].busy == 0) {
+ /* busy indicates r/wcb is executing! */
+ dat = pfdset->fd[i].dat;
+ pfdset->fd[i].fd = -1;
+ pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL;
+ pfdset->fd[i].dat = NULL;
+ i = -1;
+ }
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ } while (i != -1);
+
+ return dat;
+}
+
+/**
+ * Unregister the fd from the fdset.
+ *
+ * If parameters are invalid, return directly -2.
+ * And check whether fd is busy, if yes, return -1.
+ * Otherwise, try to delete the fd from fdset and
+ * return true.
+ */
+int
+fdset_try_del(struct fdset *pfdset, int fd)
+{
+ int i;
+
+ if (pfdset == NULL || fd == -1)
+ return -2;
+
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ i = fdset_find_fd(pfdset, fd);
+ if (i != -1 && pfdset->fd[i].busy) {
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ return -1;
+ }
+
+ if (i != -1) {
+ pfdset->fd[i].fd = -1;
+ pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL;
+ pfdset->fd[i].dat = NULL;
+ }
+
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ return 0;
+}
+
+/**
+ * This functions runs in infinite blocking loop until there is no fd in
+ * pfdset. It calls corresponding r/w handler if there is event on the fd.
+ *
+ * Before the callback is called, we set the flag to busy status; If other
+ * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it
+ * will wait until the flag is reset to zero(which indicates the callback is
+ * finished), then it could free the context after fdset_del.
+ */
+void *
+fdset_event_dispatch(void *arg)
+{
+ int i;
+ struct pollfd *pfd;
+ struct fdentry *pfdentry;
+ fd_cb rcb, wcb;
+ void *dat;
+ int fd, numfds;
+ int remove1, remove2;
+ int need_shrink;
+ struct fdset *pfdset = arg;
+ int val;
+
+ if (pfdset == NULL)
+ return NULL;
+
+ while (1) {
+
+ /*
+ * When poll is blocked, other threads might unregister
+ * listenfds from and register new listenfds into fdset.
+ * When poll returns, the entries for listenfds in the fdset
+ * might have been updated. It is ok if there is unwanted call
+ * for new listenfds.
+ */
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ numfds = pfdset->num;
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ pthread_mutex_lock(&pfdset->fd_pooling_mutex);
+ val = poll(pfdset->rwfds, numfds, 1000 /* millisecs */);
+ pthread_mutex_unlock(&pfdset->fd_pooling_mutex);
+ if (val < 0)
+ continue;
+
+ need_shrink = 0;
+ for (i = 0; i < numfds; i++) {
+ pthread_mutex_lock(&pfdset->fd_mutex);
+
+ pfdentry = &pfdset->fd[i];
+ fd = pfdentry->fd;
+ pfd = &pfdset->rwfds[i];
+
+ if (fd < 0) {
+ need_shrink = 1;
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ continue;
+ }
+
+ if (!pfd->revents) {
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ continue;
+ }
+
+ remove1 = remove2 = 0;
+
+ rcb = pfdentry->rcb;
+ wcb = pfdentry->wcb;
+ dat = pfdentry->dat;
+ pfdentry->busy = 1;
+
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ if (rcb && pfd->revents & (POLLIN | FDPOLLERR))
+ rcb(fd, dat, &remove1);
+ if (wcb && pfd->revents & (POLLOUT | FDPOLLERR))
+ wcb(fd, dat, &remove2);
+ pfdentry->busy = 0;
+ /*
+ * fdset_del needs to check busy flag.
+ * We don't allow fdset_del to be called in callback
+ * directly.
+ */
+ /*
+ * When we are to clean up the fd from fdset,
+ * because the fd is closed in the cb,
+ * the old fd val could be reused by when creates new
+ * listen fd in another thread, we couldn't call
+ * fdset_del.
+ */
+ if (remove1 || remove2) {
+ pfdentry->fd = -1;
+ need_shrink = 1;
+ }
+ }
+
+ if (need_shrink)
+ fdset_shrink(pfdset);
+ }
+
+ return NULL;
+}
+
+static void
+fdset_pipe_read_cb(int readfd, void *dat __rte_unused,
+ int *remove __rte_unused)
+{
+ char charbuf[16];
+ int r = read(readfd, charbuf, sizeof(charbuf));
+ /*
+ * Just an optimization, we don't care if read() failed
+ * so ignore explicitly its return value to make the
+ * compiler happy
+ */
+ RTE_SET_USED(r);
+}
+
+void
+fdset_pipe_uninit(struct fdset *fdset)
+{
+ fdset_del(fdset, fdset->u.readfd);
+ close(fdset->u.readfd);
+ close(fdset->u.writefd);
+}
+
+int
+fdset_pipe_init(struct fdset *fdset)
+{
+ int ret;
+
+ if (pipe(fdset->u.pipefd) < 0) {
+ RTE_LOG(ERR, VHOST_FDMAN,
+ "failed to create pipe for vhost fdset\n");
+ return -1;
+ }
+
+ ret = fdset_add(fdset, fdset->u.readfd,
+ fdset_pipe_read_cb, NULL, NULL);
+
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_FDMAN,
+ "failed to add pipe readfd %d into vhost server fdset\n",
+ fdset->u.readfd);
+
+ fdset_pipe_uninit(fdset);
+ return -1;
+ }
+
+ return 0;
+}
+
+void
+fdset_pipe_notify(struct fdset *fdset)
+{
+ int r = write(fdset->u.writefd, "1", 1);
+ /*
+ * Just an optimization, we don't care if write() failed
+ * so ignore explicitly its return value to make the
+ * compiler happy
+ */
+ RTE_SET_USED(r);
+
+}
diff --git a/src/seastar/dpdk/lib/librte_vhost/fd_man.h b/src/seastar/dpdk/lib/librte_vhost/fd_man.h
new file mode 100644
index 000000000..3ab5cfdd6
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/fd_man.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef _FD_MAN_H_
+#define _FD_MAN_H_
+#include <stdint.h>
+#include <pthread.h>
+#include <poll.h>
+
+#define MAX_FDS 1024
+
+typedef void (*fd_cb)(int fd, void *dat, int *remove);
+
+struct fdentry {
+ int fd; /* -1 indicates this entry is empty */
+ fd_cb rcb; /* callback when this fd is readable. */
+ fd_cb wcb; /* callback when this fd is writeable.*/
+ void *dat; /* fd context */
+ int busy; /* whether this entry is being used in cb. */
+};
+
+struct fdset {
+ struct pollfd rwfds[MAX_FDS];
+ struct fdentry fd[MAX_FDS];
+ pthread_mutex_t fd_mutex;
+ pthread_mutex_t fd_pooling_mutex;
+ int num; /* current fd number of this fdset */
+
+ union pipefds {
+ struct {
+ int pipefd[2];
+ };
+ struct {
+ int readfd;
+ int writefd;
+ };
+ } u;
+};
+
+
+void fdset_init(struct fdset *pfdset);
+
+int fdset_add(struct fdset *pfdset, int fd,
+ fd_cb rcb, fd_cb wcb, void *dat);
+
+void *fdset_del(struct fdset *pfdset, int fd);
+int fdset_try_del(struct fdset *pfdset, int fd);
+
+void *fdset_event_dispatch(void *arg);
+
+int fdset_pipe_init(struct fdset *fdset);
+
+void fdset_pipe_uninit(struct fdset *fdset);
+
+void fdset_pipe_notify(struct fdset *fdset);
+
+#endif
diff --git a/src/seastar/dpdk/lib/librte_vhost/iotlb.c b/src/seastar/dpdk/lib/librte_vhost/iotlb.c
new file mode 100644
index 000000000..fcf9e9861
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/iotlb.c
@@ -0,0 +1,363 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) 2017 Red Hat, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_tailq.h>
+
+#include "iotlb.h"
+#include "vhost.h"
+
+struct vhost_iotlb_entry {
+ TAILQ_ENTRY(vhost_iotlb_entry) next;
+
+ uint64_t iova;
+ uint64_t uaddr;
+ uint64_t size;
+ uint8_t perm;
+};
+
+#define IOTLB_CACHE_SIZE 2048
+
+static void
+vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq);
+
+static void
+vhost_user_iotlb_pending_remove_all(struct vhost_virtqueue *vq)
+{
+ struct vhost_iotlb_entry *node, *temp_node;
+
+ rte_rwlock_write_lock(&vq->iotlb_pending_lock);
+
+ TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next, temp_node) {
+ TAILQ_REMOVE(&vq->iotlb_pending_list, node, next);
+ rte_mempool_put(vq->iotlb_pool, node);
+ }
+
+ rte_rwlock_write_unlock(&vq->iotlb_pending_lock);
+}
+
+bool
+vhost_user_iotlb_pending_miss(struct vhost_virtqueue *vq, uint64_t iova,
+ uint8_t perm)
+{
+ struct vhost_iotlb_entry *node;
+ bool found = false;
+
+ rte_rwlock_read_lock(&vq->iotlb_pending_lock);
+
+ TAILQ_FOREACH(node, &vq->iotlb_pending_list, next) {
+ if ((node->iova == iova) && (node->perm == perm)) {
+ found = true;
+ break;
+ }
+ }
+
+ rte_rwlock_read_unlock(&vq->iotlb_pending_lock);
+
+ return found;
+}
+
+void
+vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq,
+ uint64_t iova, uint8_t perm)
+{
+ struct vhost_iotlb_entry *node;
+ int ret;
+
+ ret = rte_mempool_get(vq->iotlb_pool, (void **)&node);
+ if (ret) {
+ RTE_LOG(DEBUG, VHOST_CONFIG, "IOTLB pool empty, clear entries\n");
+ if (!TAILQ_EMPTY(&vq->iotlb_pending_list))
+ vhost_user_iotlb_pending_remove_all(vq);
+ else
+ vhost_user_iotlb_cache_random_evict(vq);
+ ret = rte_mempool_get(vq->iotlb_pool, (void **)&node);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG, "IOTLB pool still empty, failure\n");
+ return;
+ }
+ }
+
+ node->iova = iova;
+ node->perm = perm;
+
+ rte_rwlock_write_lock(&vq->iotlb_pending_lock);
+
+ TAILQ_INSERT_TAIL(&vq->iotlb_pending_list, node, next);
+
+ rte_rwlock_write_unlock(&vq->iotlb_pending_lock);
+}
+
+void
+vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq,
+ uint64_t iova, uint64_t size, uint8_t perm)
+{
+ struct vhost_iotlb_entry *node, *temp_node;
+
+ rte_rwlock_write_lock(&vq->iotlb_pending_lock);
+
+ TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next, temp_node) {
+ if (node->iova < iova)
+ continue;
+ if (node->iova >= iova + size)
+ continue;
+ if ((node->perm & perm) != node->perm)
+ continue;
+ TAILQ_REMOVE(&vq->iotlb_pending_list, node, next);
+ rte_mempool_put(vq->iotlb_pool, node);
+ }
+
+ rte_rwlock_write_unlock(&vq->iotlb_pending_lock);
+}
+
+static void
+vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq)
+{
+ struct vhost_iotlb_entry *node, *temp_node;
+
+ rte_rwlock_write_lock(&vq->iotlb_lock);
+
+ TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
+ TAILQ_REMOVE(&vq->iotlb_list, node, next);
+ rte_mempool_put(vq->iotlb_pool, node);
+ }
+
+ vq->iotlb_cache_nr = 0;
+
+ rte_rwlock_write_unlock(&vq->iotlb_lock);
+}
+
+static void
+vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq)
+{
+ struct vhost_iotlb_entry *node, *temp_node;
+ int entry_idx;
+
+ rte_rwlock_write_lock(&vq->iotlb_lock);
+
+ entry_idx = rte_rand() % vq->iotlb_cache_nr;
+
+ TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
+ if (!entry_idx) {
+ TAILQ_REMOVE(&vq->iotlb_list, node, next);
+ rte_mempool_put(vq->iotlb_pool, node);
+ vq->iotlb_cache_nr--;
+ break;
+ }
+ entry_idx--;
+ }
+
+ rte_rwlock_write_unlock(&vq->iotlb_lock);
+}
+
+void
+vhost_user_iotlb_cache_insert(struct vhost_virtqueue *vq, uint64_t iova,
+ uint64_t uaddr, uint64_t size, uint8_t perm)
+{
+ struct vhost_iotlb_entry *node, *new_node;
+ int ret;
+
+ ret = rte_mempool_get(vq->iotlb_pool, (void **)&new_node);
+ if (ret) {
+ RTE_LOG(DEBUG, VHOST_CONFIG, "IOTLB pool empty, clear entries\n");
+ if (!TAILQ_EMPTY(&vq->iotlb_list))
+ vhost_user_iotlb_cache_random_evict(vq);
+ else
+ vhost_user_iotlb_pending_remove_all(vq);
+ ret = rte_mempool_get(vq->iotlb_pool, (void **)&new_node);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG, "IOTLB pool still empty, failure\n");
+ return;
+ }
+ }
+
+ new_node->iova = iova;
+ new_node->uaddr = uaddr;
+ new_node->size = size;
+ new_node->perm = perm;
+
+ rte_rwlock_write_lock(&vq->iotlb_lock);
+
+ TAILQ_FOREACH(node, &vq->iotlb_list, next) {
+ /*
+ * Entries must be invalidated before being updated.
+ * So if iova already in list, assume identical.
+ */
+ if (node->iova == new_node->iova) {
+ rte_mempool_put(vq->iotlb_pool, new_node);
+ goto unlock;
+ } else if (node->iova > new_node->iova) {
+ TAILQ_INSERT_BEFORE(node, new_node, next);
+ vq->iotlb_cache_nr++;
+ goto unlock;
+ }
+ }
+
+ TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next);
+ vq->iotlb_cache_nr++;
+
+unlock:
+ vhost_user_iotlb_pending_remove(vq, iova, size, perm);
+
+ rte_rwlock_write_unlock(&vq->iotlb_lock);
+
+}
+
+void
+vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq,
+ uint64_t iova, uint64_t size)
+{
+ struct vhost_iotlb_entry *node, *temp_node;
+
+ if (unlikely(!size))
+ return;
+
+ rte_rwlock_write_lock(&vq->iotlb_lock);
+
+ TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
+ /* Sorted list */
+ if (unlikely(iova + size < node->iova))
+ break;
+
+ if (iova < node->iova + node->size) {
+ TAILQ_REMOVE(&vq->iotlb_list, node, next);
+ rte_mempool_put(vq->iotlb_pool, node);
+ vq->iotlb_cache_nr--;
+ }
+ }
+
+ rte_rwlock_write_unlock(&vq->iotlb_lock);
+}
+
+uint64_t
+vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova,
+ uint64_t *size, uint8_t perm)
+{
+ struct vhost_iotlb_entry *node;
+ uint64_t offset, vva = 0, mapped = 0;
+
+ if (unlikely(!*size))
+ goto out;
+
+ TAILQ_FOREACH(node, &vq->iotlb_list, next) {
+ /* List sorted by iova */
+ if (unlikely(iova < node->iova))
+ break;
+
+ if (iova >= node->iova + node->size)
+ continue;
+
+ if (unlikely((perm & node->perm) != perm)) {
+ vva = 0;
+ break;
+ }
+
+ offset = iova - node->iova;
+ if (!vva)
+ vva = node->uaddr + offset;
+
+ mapped += node->size - offset;
+ iova = node->iova + node->size;
+
+ if (mapped >= *size)
+ break;
+ }
+
+out:
+ /* Only part of the requested chunk is mapped */
+ if (unlikely(mapped < *size))
+ *size = mapped;
+
+ return vva;
+}
+
+void
+vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq)
+{
+ vhost_user_iotlb_cache_remove_all(vq);
+ vhost_user_iotlb_pending_remove_all(vq);
+}
+
+int
+vhost_user_iotlb_init(struct virtio_net *dev, int vq_index)
+{
+ char pool_name[RTE_MEMPOOL_NAMESIZE];
+ struct vhost_virtqueue *vq = dev->virtqueue[vq_index];
+ int socket = 0;
+
+ if (vq->iotlb_pool) {
+ /*
+ * The cache has already been initialized,
+ * just drop all cached and pending entries.
+ */
+ vhost_user_iotlb_flush_all(vq);
+ }
+
+#ifdef RTE_LIBRTE_VHOST_NUMA
+ if (get_mempolicy(&socket, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR) != 0)
+ socket = 0;
+#endif
+
+ rte_rwlock_init(&vq->iotlb_lock);
+ rte_rwlock_init(&vq->iotlb_pending_lock);
+
+ TAILQ_INIT(&vq->iotlb_list);
+ TAILQ_INIT(&vq->iotlb_pending_list);
+
+ snprintf(pool_name, sizeof(pool_name), "iotlb_cache_%d_%d",
+ dev->vid, vq_index);
+
+ /* If already created, free it and recreate */
+ vq->iotlb_pool = rte_mempool_lookup(pool_name);
+ if (vq->iotlb_pool)
+ rte_mempool_free(vq->iotlb_pool);
+
+ vq->iotlb_pool = rte_mempool_create(pool_name,
+ IOTLB_CACHE_SIZE, sizeof(struct vhost_iotlb_entry), 0,
+ 0, 0, NULL, NULL, NULL, socket,
+ MEMPOOL_F_NO_CACHE_ALIGN |
+ MEMPOOL_F_SP_PUT |
+ MEMPOOL_F_SC_GET);
+ if (!vq->iotlb_pool) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to create IOTLB cache pool (%s)\n",
+ pool_name);
+ return -1;
+ }
+
+ vq->iotlb_cache_nr = 0;
+
+ return 0;
+}
diff --git a/src/seastar/dpdk/lib/librte_vhost/iotlb.h b/src/seastar/dpdk/lib/librte_vhost/iotlb.h
new file mode 100644
index 000000000..60b9e4c57
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/iotlb.h
@@ -0,0 +1,79 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) 2017 Red Hat, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _VHOST_IOTLB_H_
+#define _VHOST_IOTLB_H_
+
+#include <stdbool.h>
+
+#include "vhost.h"
+
+static __rte_always_inline void
+vhost_user_iotlb_rd_lock(struct vhost_virtqueue *vq)
+{
+ rte_rwlock_read_lock(&vq->iotlb_lock);
+}
+
+static __rte_always_inline void
+vhost_user_iotlb_rd_unlock(struct vhost_virtqueue *vq)
+{
+ rte_rwlock_read_unlock(&vq->iotlb_lock);
+}
+
+static __rte_always_inline void
+vhost_user_iotlb_wr_lock(struct vhost_virtqueue *vq)
+{
+ rte_rwlock_write_lock(&vq->iotlb_lock);
+}
+
+static __rte_always_inline void
+vhost_user_iotlb_wr_unlock(struct vhost_virtqueue *vq)
+{
+ rte_rwlock_write_unlock(&vq->iotlb_lock);
+}
+
+void vhost_user_iotlb_cache_insert(struct vhost_virtqueue *vq, uint64_t iova,
+ uint64_t uaddr, uint64_t size,
+ uint8_t perm);
+void vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq,
+ uint64_t iova, uint64_t size);
+uint64_t vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova,
+ uint64_t *size, uint8_t perm);
+bool vhost_user_iotlb_pending_miss(struct vhost_virtqueue *vq, uint64_t iova,
+ uint8_t perm);
+void vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq, uint64_t iova,
+ uint8_t perm);
+void vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, uint64_t iova,
+ uint64_t size, uint8_t perm);
+void vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq);
+int vhost_user_iotlb_init(struct virtio_net *dev, int vq_index);
+
+#endif /* _VHOST_IOTLB_H_ */
diff --git a/src/seastar/dpdk/lib/librte_vhost/meson.build b/src/seastar/dpdk/lib/librte_vhost/meson.build
new file mode 100644
index 000000000..3090bbe08
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/meson.build
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017-2018 Intel Corporation
+
+if not is_linux
+ build = false
+endif
+if has_libnuma == 1
+ dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
+endif
+dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY',
+ cc.has_header('linux/userfaultfd.h'))
+version = 4
+allow_experimental_apis = true
+cflags += '-fno-strict-aliasing'
+sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vdpa.c',
+ 'vhost.c', 'vhost_user.c',
+ 'virtio_net.c', 'vhost_crypto.c')
+headers = files('rte_vhost.h', 'rte_vdpa.h', 'rte_vhost_crypto.h')
+deps += ['ethdev', 'cryptodev', 'hash', 'pci']
diff --git a/src/seastar/dpdk/lib/librte_vhost/rte_vdpa.h b/src/seastar/dpdk/lib/librte_vhost/rte_vdpa.h
new file mode 100644
index 000000000..51a9a7b74
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/rte_vdpa.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_VDPA_H_
+#define _RTE_VDPA_H_
+
+/**
+ * @file
+ *
+ * Device specific vhost lib
+ */
+
+#include <stdbool.h>
+
+#include <rte_pci.h>
+#include "rte_vhost.h"
+
+#define MAX_VDPA_NAME_LEN 128
+
+enum vdpa_addr_type {
+ PCI_ADDR,
+ VDPA_ADDR_MAX
+};
+
+/**
+ * vdpa device address
+ */
+struct rte_vdpa_dev_addr {
+ /** vdpa address type */
+ enum vdpa_addr_type type;
+
+ /** vdpa pci address */
+ union {
+ uint8_t __dummy[64];
+ struct rte_pci_addr pci_addr;
+ };
+};
+
+/**
+ * vdpa device operations
+ */
+struct rte_vdpa_dev_ops {
+ /** Get capabilities of this device */
+ int (*get_queue_num)(int did, uint32_t *queue_num);
+
+ /** Get supported features of this device */
+ int (*get_features)(int did, uint64_t *features);
+
+ /** Get supported protocol features of this device */
+ int (*get_protocol_features)(int did, uint64_t *protocol_features);
+
+ /** Driver configure/close the device */
+ int (*dev_conf)(int vid);
+ int (*dev_close)(int vid);
+
+ /** Enable/disable this vring */
+ int (*set_vring_state)(int vid, int vring, int state);
+
+ /** Set features when changed */
+ int (*set_features)(int vid);
+
+ /** Destination operations when migration done */
+ int (*migration_done)(int vid);
+
+ /** Get the vfio group fd */
+ int (*get_vfio_group_fd)(int vid);
+
+ /** Get the vfio device fd */
+ int (*get_vfio_device_fd)(int vid);
+
+ /** Get the notify area info of the queue */
+ int (*get_notify_area)(int vid, int qid,
+ uint64_t *offset, uint64_t *size);
+
+ /** Reserved for future extension */
+ void *reserved[5];
+};
+
+/**
+ * vdpa device structure includes device address and device operations.
+ */
+struct rte_vdpa_device {
+ /** vdpa device address */
+ struct rte_vdpa_dev_addr addr;
+ /** vdpa device operations */
+ struct rte_vdpa_dev_ops *ops;
+} __rte_cache_aligned;
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register a vdpa device
+ *
+ * @param addr
+ * the vdpa device address
+ * @param ops
+ * the vdpa device operations
+ * @return
+ * device id on success, -1 on failure
+ */
+int __rte_experimental
+rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
+ struct rte_vdpa_dev_ops *ops);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister a vdpa device
+ *
+ * @param did
+ * vdpa device id
+ * @return
+ * device id on success, -1 on failure
+ */
+int __rte_experimental
+rte_vdpa_unregister_device(int did);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Find the device id of a vdpa device
+ *
+ * @param addr
+ * the vdpa device address
+ * @return
+ * device id on success, -1 on failure
+ */
+int __rte_experimental
+rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Find a vdpa device based on device id
+ *
+ * @param did
+ * device id
+ * @return
+ * rte_vdpa_device on success, NULL on failure
+ */
+struct rte_vdpa_device * __rte_experimental
+rte_vdpa_get_device(int did);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Get current available vdpa device number
+ *
+ * @return
+ * available vdpa device number
+ */
+int __rte_experimental
+rte_vdpa_get_device_num(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable/Disable host notifier mapping for a vdpa port.
+ *
+ * @param vid
+ * vhost device id
+ * @param enable
+ * true for host notifier map, false for host notifier unmap
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_host_notifier_ctrl(int vid, bool enable);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Synchronize the used ring from mediated ring to guest, log dirty
+ * page for each writeable buffer, caller should handle the used
+ * ring logging before device stop.
+ *
+ * @param vid
+ * vhost device id
+ * @param qid
+ * vhost queue id
+ * @param vring_m
+ * mediated virtio ring pointer
+ * @return
+ * number of synced used entries on success, -1 on failure
+ */
+int __rte_experimental
+rte_vdpa_relay_vring_used(int vid, uint16_t qid, void *vring_m);
+#endif /* _RTE_VDPA_H_ */
diff --git a/src/seastar/dpdk/lib/librte_vhost/rte_vhost.h b/src/seastar/dpdk/lib/librte_vhost/rte_vhost.h
new file mode 100644
index 000000000..0226b3eff
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/rte_vhost.h
@@ -0,0 +1,728 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2017 Intel Corporation
+ */
+
+#ifndef _RTE_VHOST_H_
+#define _RTE_VHOST_H_
+
+/**
+ * @file
+ * Interface to vhost-user
+ */
+
+#include <stdint.h>
+#include <sys/eventfd.h>
+
+#include <rte_memory.h>
+#include <rte_mempool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* These are not C++-aware. */
+#include <linux/vhost.h>
+#include <linux/virtio_ring.h>
+
+#define RTE_VHOST_USER_CLIENT (1ULL << 0)
+#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
+#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2)
+#define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3)
+#define RTE_VHOST_USER_POSTCOPY_SUPPORT (1ULL << 4)
+
+/** Protocol features. */
+#ifndef VHOST_USER_PROTOCOL_F_MQ
+#define VHOST_USER_PROTOCOL_F_MQ 0
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_RARP
+#define VHOST_USER_PROTOCOL_F_RARP 2
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_NET_MTU
+#define VHOST_USER_PROTOCOL_F_NET_MTU 4
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
+#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
+#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
+#define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
+#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
+#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
+#endif
+
+/** Indicate whether protocol features negotiation is supported. */
+#ifndef VHOST_USER_F_PROTOCOL_FEATURES
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#endif
+
+/**
+ * Information relating to memory regions including offsets to
+ * addresses in QEMUs memory file.
+ */
+struct rte_vhost_mem_region {
+ uint64_t guest_phys_addr;
+ uint64_t guest_user_addr;
+ uint64_t host_user_addr;
+ uint64_t size;
+ void *mmap_addr;
+ uint64_t mmap_size;
+ int fd;
+};
+
+/**
+ * Memory structure includes region and mapping information.
+ */
+struct rte_vhost_memory {
+ uint32_t nregions;
+ struct rte_vhost_mem_region regions[];
+};
+
+struct rte_vhost_vring {
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+ uint64_t log_guest_addr;
+
+ /** Deprecated, use rte_vhost_vring_call() instead. */
+ int callfd;
+
+ int kickfd;
+ uint16_t size;
+};
+
+/**
+ * Possible results of the vhost user message handling callbacks
+ */
+enum rte_vhost_msg_result {
+ /* Message handling failed */
+ RTE_VHOST_MSG_RESULT_ERR = -1,
+ /* Message handling successful */
+ RTE_VHOST_MSG_RESULT_OK = 0,
+ /* Message handling successful and reply prepared */
+ RTE_VHOST_MSG_RESULT_REPLY = 1,
+ /* Message not handled */
+ RTE_VHOST_MSG_RESULT_NOT_HANDLED,
+};
+
+/**
+ * Function prototype for the vhost backend to handle specific vhost user
+ * messages.
+ *
+ * @param vid
+ * vhost device id
+ * @param msg
+ * Message pointer.
+ * @return
+ * RTE_VHOST_MSG_RESULT_OK on success,
+ * RTE_VHOST_MSG_RESULT_REPLY on success with reply,
+ * RTE_VHOST_MSG_RESULT_ERR on failure,
+ * RTE_VHOST_MSG_RESULT_NOT_HANDLED if message was not handled.
+ */
+typedef enum rte_vhost_msg_result (*rte_vhost_msg_handle)(int vid, void *msg);
+
+/**
+ * Optional vhost user message handlers.
+ */
+struct rte_vhost_user_extern_ops {
+ /* Called prior to the master message handling. */
+ rte_vhost_msg_handle pre_msg_handle;
+ /* Called after the master message handling. */
+ rte_vhost_msg_handle post_msg_handle;
+};
+
+/**
+ * Device and vring operations.
+ */
+struct vhost_device_ops {
+ int (*new_device)(int vid); /**< Add device. */
+ void (*destroy_device)(int vid); /**< Remove device. */
+
+ int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
+
+ /**
+ * Features could be changed after the feature negotiation.
+ * For example, VHOST_F_LOG_ALL will be set/cleared at the
+ * start/end of live migration, respectively. This callback
+ * is used to inform the application on such change.
+ */
+ int (*features_changed)(int vid, uint64_t features);
+
+ int (*new_connection)(int vid);
+ void (*destroy_connection)(int vid);
+
+ void *reserved[2]; /**< Reserved for future extension */
+};
+
+/**
+ * Convert guest physical address to host virtual address
+ *
+ * This function is deprecated because unsafe.
+ * New rte_vhost_va_from_guest_pa() should be used instead to ensure
+ * guest physical ranges are fully and contiguously mapped into
+ * process virtual address space.
+ *
+ * @param mem
+ * the guest memory regions
+ * @param gpa
+ * the guest physical address for querying
+ * @return
+ * the host virtual address on success, 0 on failure
+ */
+__rte_deprecated
+static __rte_always_inline uint64_t
+rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
+{
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+ if (gpa >= reg->guest_phys_addr &&
+ gpa < reg->guest_phys_addr + reg->size) {
+ return gpa - reg->guest_phys_addr +
+ reg->host_user_addr;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Convert guest physical address to host virtual address safely
+ *
+ * This variant of rte_vhost_gpa_to_vva() takes care all the
+ * requested length is mapped and contiguous in process address
+ * space.
+ *
+ * @param mem
+ * the guest memory regions
+ * @param gpa
+ * the guest physical address for querying
+ * @param len
+ * the size of the requested area to map, updated with actual size mapped
+ * @return
+ * the host virtual address on success, 0 on failure
+ */
+static __rte_always_inline uint64_t
+rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
+ uint64_t gpa, uint64_t *len)
+{
+ struct rte_vhost_mem_region *r;
+ uint32_t i;
+
+ for (i = 0; i < mem->nregions; i++) {
+ r = &mem->regions[i];
+ if (gpa >= r->guest_phys_addr &&
+ gpa < r->guest_phys_addr + r->size) {
+
+ if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
+ *len = r->guest_phys_addr + r->size - gpa;
+
+ return gpa - r->guest_phys_addr +
+ r->host_user_addr;
+ }
+ }
+ *len = 0;
+
+ return 0;
+}
+
+#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL))
+
+/**
+ * Log the memory write start with given address.
+ *
+ * This function only need be invoked when the live migration starts.
+ * Therefore, we won't need call it at all in the most of time. For
+ * making the performance impact be minimum, it's suggested to do a
+ * check before calling it:
+ *
+ * if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ * rte_vhost_log_write(vid, addr, len);
+ *
+ * @param vid
+ * vhost device ID
+ * @param addr
+ * the starting address for write
+ * @param len
+ * the length to write
+ */
+void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
+
+/**
+ * Log the used ring update start at given offset.
+ *
+ * Same as rte_vhost_log_write, it's suggested to do a check before
+ * calling it:
+ *
+ * if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ * rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * the vring index
+ * @param offset
+ * the offset inside the used ring
+ * @param len
+ * the length to write
+ */
+void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+ uint64_t offset, uint64_t len);
+
+int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
+
+/**
+ * Register vhost driver. path could be different for multiple
+ * instance support.
+ */
+int rte_vhost_driver_register(const char *path, uint64_t flags);
+
+/* Unregister vhost driver. This is only meaningful to vhost user. */
+int rte_vhost_driver_unregister(const char *path);
+
+/**
+ * Set the vdpa device id, enforce single connection per socket
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param did
+ * Device id
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_attach_vdpa_device(const char *path, int did);
+
+/**
+ * Unset the vdpa device id
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_detach_vdpa_device(const char *path);
+
+/**
+ * Get the device id
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @return
+ * Device id, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_vdpa_device_id(const char *path);
+
+/**
+ * Set the feature bits the vhost-user driver supports.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Supported features
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_features(const char *path, uint64_t features);
+
+/**
+ * Enable vhost-user driver features.
+ *
+ * Note that
+ * - the param features should be a subset of the feature bits provided
+ * by rte_vhost_driver_set_features().
+ * - it must be invoked before vhost-user negotiation starts.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Features to enable
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_enable_features(const char *path, uint64_t features);
+
+/**
+ * Disable vhost-user driver features.
+ *
+ * The two notes at rte_vhost_driver_enable_features() also apply here.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Features to disable
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_disable_features(const char *path, uint64_t features);
+
+/**
+ * Get the feature bits before feature negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * A pointer to store the queried feature bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_get_features(const char *path, uint64_t *features);
+
+/**
+ * Set the protocol feature bits before feature negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param protocol_features
+ * Supported protocol features
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_set_protocol_features(const char *path,
+ uint64_t protocol_features);
+
+/**
+ * Get the protocol feature bits before feature negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param protocol_features
+ * A pointer to store the queried protocol feature bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_protocol_features(const char *path,
+ uint64_t *protocol_features);
+
+/**
+ * Get the queue number bits before feature negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param queue_num
+ * A pointer to store the queried queue number bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
+
+/**
+ * Get the feature bits after negotiation
+ *
+ * @param vid
+ * Vhost device ID
+ * @param features
+ * A pointer to store the queried feature bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
+
+/* Register callbacks. */
+int rte_vhost_driver_callback_register(const char *path,
+ struct vhost_device_ops const * const ops);
+
+/**
+ *
+ * Start the vhost-user driver.
+ *
+ * This function triggers the vhost-user negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_start(const char *path);
+
+/**
+ * Get the MTU value of the device if set in QEMU.
+ *
+ * @param vid
+ * virtio-net device ID
+ * @param mtu
+ * The variable to store the MTU value
+ *
+ * @return
+ * 0: success
+ * -EAGAIN: device not yet started
+ * -ENOTSUP: device does not support MTU feature
+ */
+int rte_vhost_get_mtu(int vid, uint16_t *mtu);
+
+/**
+ * Get the numa node from which the virtio net device's memory
+ * is allocated.
+ *
+ * @param vid
+ * vhost device ID
+ *
+ * @return
+ * The numa node, -1 on failure
+ */
+int rte_vhost_get_numa_node(int vid);
+
+/**
+ * @deprecated
+ * Get the number of queues the device supports.
+ *
+ * Note this function is deprecated, as it returns a queue pair number,
+ * which is vhost specific. Instead, rte_vhost_get_vring_num should
+ * be used.
+ *
+ * @param vid
+ * vhost device ID
+ *
+ * @return
+ * The number of queues, 0 on failure
+ */
+__rte_deprecated
+uint32_t rte_vhost_get_queue_num(int vid);
+
+/**
+ * Get the number of vrings the device supports.
+ *
+ * @param vid
+ * vhost device ID
+ *
+ * @return
+ * The number of vrings, 0 on failure
+ */
+uint16_t rte_vhost_get_vring_num(int vid);
+
+/**
+ * Get the virtio net device's ifname, which is the vhost-user socket
+ * file path.
+ *
+ * @param vid
+ * vhost device ID
+ * @param buf
+ * The buffer to stored the queried ifname
+ * @param len
+ * The length of buf
+ *
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_ifname(int vid, char *buf, size_t len);
+
+/**
+ * Get how many avail entries are left in the queue
+ *
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index
+ *
+ * @return
+ * num of avail entries left
+ */
+uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
+
+struct rte_mbuf;
+struct rte_mempool;
+/**
+ * This function adds buffers to the virtio devices RX virtqueue. Buffers can
+ * be received from the physical port or from another virtual device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue.
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index in mq case
+ * @param pkts
+ * array to contain packets to be enqueued
+ * @param count
+ * packets num to be enqueued
+ * @return
+ * num of packets enqueued
+ */
+uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * This function gets guest buffers from the virtio device TX virtqueue,
+ * construct host mbufs, copies guest buffer content to host mbufs and
+ * store them in pkts to be processed.
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index in mq case
+ * @param mbuf_pool
+ * mbuf_pool where host mbuf is allocated.
+ * @param pkts
+ * array to contain packets to be dequeued
+ * @param count
+ * packets num to be dequeued
+ * @return
+ * num of packets dequeued
+ */
+uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * Get guest mem table: a list of memory regions.
+ *
+ * An rte_vhost_vhost_memory object will be allocated internally, to hold the
+ * guest memory regions. Application should free it at destroy_device()
+ * callback.
+ *
+ * @param vid
+ * vhost device ID
+ * @param mem
+ * To store the returned mem regions
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
+
+/**
+ * Get guest vring info, including the vring address, vring size, etc.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param vring
+ * the structure to hold the requested vring info
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+ struct rte_vhost_vring *vring);
+
+/**
+ * Notify the guest that used descriptors have been added to the vring. This
+ * function acts as a memory barrier.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_vring_call(int vid, uint16_t vring_idx);
+
+/**
+ * Get vhost RX queue avail count.
+ *
+ * @param vid
+ * vhost device ID
+ * @param qid
+ * virtio queue index in mq case
+ * @return
+ * num of desc available
+ */
+uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
+
+/**
+ * Get log base and log size of the vhost device
+ *
+ * @param vid
+ * vhost device ID
+ * @param log_base
+ * vhost log base
+ * @param log_size
+ * vhost log size
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
+
+/**
+ * Get last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * vhost queue index
+ * @param last_avail_idx
+ * vhost last_avail_idx to get
+ * @param last_used_idx
+ * vhost last_used_idx to get
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+ uint16_t *last_avail_idx, uint16_t *last_used_idx);
+
+/**
+ * Set last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * vhost queue index
+ * @param last_avail_idx
+ * last_avail_idx to set
+ * @param last_used_idx
+ * last_used_idx to set
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+ uint16_t last_avail_idx, uint16_t last_used_idx);
+
+/**
+ * Register external message handling callbacks
+ *
+ * @param vid
+ * vhost device ID
+ * @param ops
+ * virtio external callbacks to register
+ * @param ctx
+ * additional context passed to the callbacks
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_extern_callback_register(int vid,
+ struct rte_vhost_user_extern_ops const * const ops, void *ctx);
+
+/**
+ * Get vdpa device id for vhost device.
+ *
+ * @param vid
+ * vhost device id
+ * @return
+ * device id
+ */
+int __rte_experimental
+rte_vhost_get_vdpa_device_id(int vid);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_VHOST_H_ */
diff --git a/src/seastar/dpdk/lib/librte_vhost/rte_vhost_crypto.h b/src/seastar/dpdk/lib/librte_vhost/rte_vhost_crypto.h
new file mode 100644
index 000000000..d08e0ffab
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/rte_vhost_crypto.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef _VHOST_CRYPTO_H_
+#define _VHOST_CRYPTO_H_
+
+#define VHOST_CRYPTO_MBUF_POOL_SIZE (8192)
+#define VHOST_CRYPTO_MAX_BURST_SIZE (64)
+#define VHOST_CRYPTO_SESSION_MAP_ENTRIES (1024) /**< Max nb sessions */
+/** max nb virtual queues in a burst for finalizing*/
+#define VIRTIO_CRYPTO_MAX_NUM_BURST_VQS (64)
+
+enum rte_vhost_crypto_zero_copy {
+ RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE = 0,
+ RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE = 1,
+ RTE_VHOST_CRYPTO_MAX_ZERO_COPY_OPTIONS
+};
+
+/**
+ * Create Vhost-crypto instance
+ *
+ * @param vid
+ * The identifier of the vhost device.
+ * @param cryptodev_id
+ * The identifier of DPDK Cryptodev, the same cryptodev_id can be assigned to
+ * multiple Vhost-crypto devices.
+ * @param sess_pool
+ * The pointer to the created cryptodev session pool.
+ * @param sess_priv_pool
+ * The pointer to the created cryptodev session private data mempool.
+ * @param socket_id
+ * NUMA Socket ID to allocate resources on. *
+ * @return
+ * 0 if the Vhost Crypto Instance is created successfully.
+ * Negative integer if otherwise
+ */
+int __rte_experimental
+rte_vhost_crypto_create(int vid, uint8_t cryptodev_id,
+ struct rte_mempool *sess_pool,
+ struct rte_mempool *sess_priv_pool,
+ int socket_id);
+
+/**
+ * Free the Vhost-crypto instance
+ *
+ * @param vid
+ * The identifier of the vhost device.
+ * @return
+ * 0 if the Vhost Crypto Instance is created successfully.
+ * Negative integer if otherwise.
+ */
+int __rte_experimental
+rte_vhost_crypto_free(int vid);
+
+/**
+ * Enable or disable zero copy feature
+ *
+ * @param vid
+ * The identifier of the vhost device.
+ * @param option
+ * Flag of zero copy feature.
+ * @return
+ * 0 if completed successfully.
+ * Negative integer if otherwise.
+ */
+int __rte_experimental
+rte_vhost_crypto_set_zero_copy(int vid, enum rte_vhost_crypto_zero_copy option);
+
+/**
+ * Fetch a number of vring descriptors from virt-queue and translate to DPDK
+ * crypto operations. After this function is executed, the user can enqueue
+ * the processed ops to the target cryptodev.
+ *
+ * @param vid
+ * The identifier of the vhost device.
+ * @param qid
+ * Virtio queue index.
+ * @param ops
+ * The address of an array of pointers to *rte_crypto_op* structures that must
+ * be large enough to store *nb_ops* pointers in it.
+ * @param nb_ops
+ * The maximum number of operations to be fetched and translated.
+ * @return
+ * The number of fetched and processed vhost crypto request operations.
+ */
+uint16_t __rte_experimental
+rte_vhost_crypto_fetch_requests(int vid, uint32_t qid,
+ struct rte_crypto_op **ops, uint16_t nb_ops);
+/**
+ * Finalize the dequeued crypto ops. After the translated crypto ops are
+ * dequeued from the cryptodev, this function shall be called to write the
+ * processed data back to the vring descriptor (if no-copy is turned off).
+ *
+ * @param ops
+ * The address of an array of *rte_crypto_op* structure that was dequeued
+ * from cryptodev.
+ * @param nb_ops
+ * The number of operations contained in the array.
+ * @callfds
+ * The callfd number(s) contained in this burst, this shall be an array with
+ * no less than VIRTIO_CRYPTO_MAX_NUM_BURST_VQS elements.
+ * @nb_callfds
+ * The number of call_fd numbers exist in the callfds.
+ * @return
+ * The number of ops processed.
+ */
+uint16_t __rte_experimental
+rte_vhost_crypto_finalize_requests(struct rte_crypto_op **ops,
+ uint16_t nb_ops, int *callfds, uint16_t *nb_callfds);
+
+#endif /**< _VHOST_CRYPTO_H_ */
diff --git a/src/seastar/dpdk/lib/librte_vhost/rte_vhost_version.map b/src/seastar/dpdk/lib/librte_vhost/rte_vhost_version.map
new file mode 100644
index 000000000..5f1d4a75c
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/rte_vhost_version.map
@@ -0,0 +1,90 @@
+DPDK_2.0 {
+ global:
+
+ rte_vhost_dequeue_burst;
+ rte_vhost_driver_callback_register;
+ rte_vhost_driver_register;
+ rte_vhost_enable_guest_notification;
+ rte_vhost_enqueue_burst;
+
+ local: *;
+};
+
+DPDK_2.1 {
+ global:
+
+ rte_vhost_driver_unregister;
+
+} DPDK_2.0;
+
+DPDK_16.07 {
+ global:
+
+ rte_vhost_avail_entries;
+ rte_vhost_get_ifname;
+ rte_vhost_get_numa_node;
+ rte_vhost_get_queue_num;
+
+} DPDK_2.1;
+
+DPDK_17.05 {
+ global:
+
+ rte_vhost_driver_disable_features;
+ rte_vhost_driver_enable_features;
+ rte_vhost_driver_get_features;
+ rte_vhost_driver_set_features;
+ rte_vhost_driver_start;
+ rte_vhost_get_mem_table;
+ rte_vhost_get_mtu;
+ rte_vhost_get_negotiated_features;
+ rte_vhost_get_vhost_vring;
+ rte_vhost_get_vring_num;
+ rte_vhost_gpa_to_vva;
+ rte_vhost_log_used_vring;
+ rte_vhost_log_write;
+
+} DPDK_16.07;
+
+DPDK_17.08 {
+ global:
+
+ rte_vhost_rx_queue_count;
+
+} DPDK_17.05;
+
+DPDK_18.02 {
+ global:
+
+ rte_vhost_vring_call;
+
+} DPDK_17.08;
+
+EXPERIMENTAL {
+ global:
+
+ rte_vdpa_register_device;
+ rte_vdpa_unregister_device;
+ rte_vdpa_find_device_id;
+ rte_vdpa_get_device;
+ rte_vdpa_get_device_num;
+ rte_vhost_driver_attach_vdpa_device;
+ rte_vhost_driver_detach_vdpa_device;
+ rte_vhost_driver_get_vdpa_device_id;
+ rte_vhost_get_vdpa_device_id;
+ rte_vhost_driver_get_protocol_features;
+ rte_vhost_driver_get_queue_num;
+ rte_vhost_get_log_base;
+ rte_vhost_get_vring_base;
+ rte_vhost_set_vring_base;
+ rte_vhost_crypto_create;
+ rte_vhost_crypto_free;
+ rte_vhost_crypto_fetch_requests;
+ rte_vhost_crypto_finalize_requests;
+ rte_vhost_crypto_set_zero_copy;
+ rte_vhost_va_from_guest_pa;
+ rte_vhost_host_notifier_ctrl;
+ rte_vdpa_relay_vring_used;
+ rte_vhost_extern_callback_register;
+ rte_vhost_driver_set_protocol_features;
+};
diff --git a/src/seastar/dpdk/lib/librte_vhost/socket.c b/src/seastar/dpdk/lib/librte_vhost/socket.c
new file mode 100644
index 000000000..274988c4d
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/socket.c
@@ -0,0 +1,1123 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+
+TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created.
+ */
+struct vhost_user_socket {
+ struct vhost_user_connection_list conn_list;
+ pthread_mutex_t conn_mutex;
+ char *path;
+ int socket_fd;
+ struct sockaddr_un un;
+ bool is_server;
+ bool reconnect;
+ bool dequeue_zero_copy;
+ bool iommu_support;
+ bool use_builtin_virtio_net;
+
+ /*
+ * The "supported_features" indicates the feature bits the
+ * vhost driver supports. The "features" indicates the feature
+ * bits after the rte_vhost_driver_features_disable/enable().
+ * It is also the final feature bits used for vhost-user
+ * features negotiation.
+ */
+ uint64_t supported_features;
+ uint64_t features;
+
+ uint64_t protocol_features;
+
+ /*
+ * Device id to identify a specific backend device.
+ * It's set to -1 for the default software implementation.
+ * If valid, one socket can have 1 connection only.
+ */
+ int vdpa_dev_id;
+
+ struct vhost_device_ops const *notify_ops;
+};
+
+struct vhost_user_connection {
+ struct vhost_user_socket *vsocket;
+ int connfd;
+ int vid;
+
+ TAILQ_ENTRY(vhost_user_connection) next;
+};
+
+#define MAX_VHOST_SOCKET 1024
+struct vhost_user {
+ struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+ struct fdset fdset;
+ int vsocket_cnt;
+ pthread_mutex_t mutex;
+};
+
+#define MAX_VIRTIO_BACKLOG 128
+
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_read_cb(int fd, void *dat, int *remove);
+static int create_unix_socket(struct vhost_user_socket *vsocket);
+static int vhost_user_start_client(struct vhost_user_socket *vsocket);
+
+static struct vhost_user vhost_user = {
+ .fdset = {
+ .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+ .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+ .fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
+ .num = 0
+ },
+ .vsocket_cnt = 0,
+ .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/*
+ * return bytes# of read on success or negative val on failure. Update fdnum
+ * with number of fds read.
+ */
+int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
+ int *fd_num)
+{
+ struct iovec iov;
+ struct msghdr msgh;
+ char control[CMSG_SPACE(max_fds * sizeof(int))];
+ struct cmsghdr *cmsg;
+ int got_fds = 0;
+ int ret;
+
+ *fd_num = 0;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(sockfd, &msgh, 0);
+ if (ret <= 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+ return ret;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
+ return -1;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+ *fd_num = got_fds;
+ memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
+ break;
+ }
+ }
+
+ /* Clear out unused file descriptors */
+ while (got_fds < max_fds)
+ fds[got_fds++] = -1;
+
+ return ret;
+}
+
+int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ int ret;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+
+ if (fds && fd_num > 0) {
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ if (cmsg == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n");
+ errno = EINVAL;
+ return -1;
+ }
+ cmsg->cmsg_len = CMSG_LEN(fdsize);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fdsize);
+ } else {
+ msgh.msg_control = NULL;
+ msgh.msg_controllen = 0;
+ }
+
+ do {
+ ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
+ return ret;
+ }
+
+ return ret;
+}
+
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+ int vid;
+ size_t size;
+ struct vhost_user_connection *conn;
+ int ret;
+
+ if (vsocket == NULL)
+ return;
+
+ conn = malloc(sizeof(*conn));
+ if (conn == NULL) {
+ close(fd);
+ return;
+ }
+
+ vid = vhost_new_device();
+ if (vid == -1) {
+ goto err;
+ }
+
+ size = strnlen(vsocket->path, PATH_MAX);
+ vhost_set_ifname(vid, vsocket->path, size);
+
+ vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);
+
+ vhost_attach_vdpa_device(vid, vsocket->vdpa_dev_id);
+
+ if (vsocket->dequeue_zero_copy)
+ vhost_enable_dequeue_zero_copy(vid);
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+ if (vsocket->notify_ops->new_connection) {
+ ret = vsocket->notify_ops->new_connection(vid);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add vhost user connection with fd %d\n",
+ fd);
+ goto err_cleanup;
+ }
+ }
+
+ conn->connfd = fd;
+ conn->vsocket = vsocket;
+ conn->vid = vid;
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
+ NULL, conn);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add fd %d into vhost server fdset\n",
+ fd);
+
+ if (vsocket->notify_ops->destroy_connection)
+ vsocket->notify_ops->destroy_connection(conn->vid);
+
+ goto err_cleanup;
+ }
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+
+ fdset_pipe_notify(&vhost_user.fdset);
+ return;
+
+err_cleanup:
+ vhost_destroy_device(vid);
+err:
+ free(conn);
+ close(fd);
+}
+
+/* call back when there is new vhost-user connection from client */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+ struct vhost_user_socket *vsocket = dat;
+
+ fd = accept(fd, NULL, NULL);
+ if (fd < 0)
+ return;
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
+ vhost_user_add_connection(fd, vsocket);
+}
+
+static void
+vhost_user_read_cb(int connfd, void *dat, int *remove)
+{
+ struct vhost_user_connection *conn = dat;
+ struct vhost_user_socket *vsocket = conn->vsocket;
+ int ret;
+
+ ret = vhost_user_msg_handler(conn->vid, connfd);
+ if (ret < 0) {
+ struct virtio_net *dev = get_device(conn->vid);
+
+ close(connfd);
+ *remove = 1;
+
+ if (dev)
+ vhost_destroy_device_notify(dev);
+
+ if (vsocket->notify_ops->destroy_connection)
+ vsocket->notify_ops->destroy_connection(conn->vid);
+
+ vhost_destroy_device(conn->vid);
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_REMOVE(&vsocket->conn_list, conn, next);
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+
+ free(conn);
+
+ if (vsocket->reconnect) {
+ create_unix_socket(vsocket);
+ vhost_user_start_client(vsocket);
+ }
+ }
+}
+
+static int
+create_unix_socket(struct vhost_user_socket *vsocket)
+{
+ int fd;
+ struct sockaddr_un *un = &vsocket->un;
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -1;
+ RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+ vsocket->is_server ? "server" : "client", fd);
+
+ if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost-user: can't set nonblocking mode for socket, fd: "
+ "%d (%s)\n", fd, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ memset(un, 0, sizeof(*un));
+ un->sun_family = AF_UNIX;
+ strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
+ un->sun_path[sizeof(un->sun_path) - 1] = '\0';
+
+ vsocket->socket_fd = fd;
+ return 0;
+}
+
+static int
+vhost_user_start_server(struct vhost_user_socket *vsocket)
+{
+ int ret;
+ int fd = vsocket->socket_fd;
+ const char *path = vsocket->path;
+
+ /*
+ * bind () may fail if the socket file with the same name already
+ * exists. But the library obviously should not delete the file
+ * provided by the user, since we can not be sure that it is not
+ * being used by other applications. Moreover, many applications form
+ * socket names based on user input, which is prone to errors.
+ *
+ * The user must ensure that the socket does not exist before
+ * registering the vhost driver in server mode.
+ */
+ ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to bind to %s: %s; remove it and try again\n",
+ path, strerror(errno));
+ goto err;
+ }
+ RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+ ret = listen(fd, MAX_VIRTIO_BACKLOG);
+ if (ret < 0)
+ goto err;
+
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
+ NULL, vsocket);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add listen fd %d to vhost server fdset\n",
+ fd);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ close(fd);
+ return -1;
+}
+
+struct vhost_user_reconnect {
+ struct sockaddr_un un;
+ int fd;
+ struct vhost_user_socket *vsocket;
+
+ TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+struct vhost_user_reconnect_list {
+ struct vhost_user_reconnect_tailq_list head;
+ pthread_mutex_t mutex;
+};
+
+static struct vhost_user_reconnect_list reconn_list;
+static pthread_t reconn_tid;
+
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+ int ret, flags;
+
+ ret = connect(fd, un, sz);
+ if (ret < 0 && errno != EISCONN)
+ return -1;
+
+ flags = fcntl(fd, F_GETFL, 0);
+ if (flags < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't get flags for connfd %d\n", fd);
+ return -2;
+ }
+ if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't disable nonblocking on fd %d\n", fd);
+ return -2;
+ }
+ return 0;
+}
+
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+ int ret;
+ struct vhost_user_reconnect *reconn, *next;
+
+ while (1) {
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ /*
+ * An equal implementation of TAILQ_FOREACH_SAFE,
+ * which does not exist on all platforms.
+ */
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ ret = vhost_user_connect_nonblock(reconn->fd,
+ (struct sockaddr *)&reconn->un,
+ sizeof(reconn->un));
+ if (ret == -2) {
+ close(reconn->fd);
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "reconnection for fd %d failed\n",
+ reconn->fd);
+ goto remove_fd;
+ }
+ if (ret == -1)
+ continue;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "%s: connected\n", reconn->vsocket->path);
+ vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ free(reconn);
+ }
+
+ pthread_mutex_unlock(&reconn_list.mutex);
+ sleep(1);
+ }
+
+ return NULL;
+}
+
+static int
+vhost_user_reconnect_init(void)
+{
+ int ret;
+
+ ret = pthread_mutex_init(&reconn_list.mutex, NULL);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to initialize mutex");
+ return ret;
+ }
+ TAILQ_INIT(&reconn_list.head);
+
+ ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
+ vhost_user_client_reconnect, NULL);
+ if (ret != 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
+ if (pthread_mutex_destroy(&reconn_list.mutex)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to destroy reconnect mutex");
+ }
+ }
+
+ return ret;
+}
+
+static int
+vhost_user_start_client(struct vhost_user_socket *vsocket)
+{
+ int ret;
+ int fd = vsocket->socket_fd;
+ const char *path = vsocket->path;
+ struct vhost_user_reconnect *reconn;
+
+ ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
+ sizeof(vsocket->un));
+ if (ret == 0) {
+ vhost_user_add_connection(fd, vsocket);
+ return 0;
+ }
+
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "failed to connect to %s: %s\n",
+ path, strerror(errno));
+
+ if (ret == -2 || !vsocket->reconnect) {
+ close(fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path);
+ reconn = malloc(sizeof(*reconn));
+ if (reconn == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for reconnect\n");
+ close(fd);
+ return -1;
+ }
+ reconn->un = vsocket->un;
+ reconn->fd = fd;
+ reconn->vsocket = vsocket;
+ pthread_mutex_lock(&reconn_list.mutex);
+ TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+ pthread_mutex_unlock(&reconn_list.mutex);
+
+ return 0;
+}
+
+static struct vhost_user_socket *
+find_vhost_user_socket(const char *path)
+{
+ int i;
+
+ if (path == NULL)
+ return NULL;
+
+ for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+ struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+ if (!strcmp(vsocket->path, path))
+ return vsocket;
+ }
+
+ return NULL;
+}
+
+int
+rte_vhost_driver_attach_vdpa_device(const char *path, int did)
+{
+ struct vhost_user_socket *vsocket;
+
+ if (rte_vdpa_get_device(did) == NULL || path == NULL)
+ return -1;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->vdpa_dev_id = did;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_detach_vdpa_device(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->vdpa_dev_id = -1;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_vdpa_device_id(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+ int did = -1;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ did = vsocket->vdpa_dev_id;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return did;
+}
+
+int
+rte_vhost_driver_disable_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+
+ /* Note that use_builtin_virtio_net is not affected by this function
+ * since callers may want to selectively disable features of the
+ * built-in vhost net device backend.
+ */
+
+ if (vsocket)
+ vsocket->features &= ~features;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_enable_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket) {
+ if ((vsocket->supported_features & features) != features) {
+ /*
+ * trying to enable features the driver doesn't
+ * support.
+ */
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return -1;
+ }
+ vsocket->features |= features;
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_set_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket) {
+ vsocket->supported_features = features;
+ vsocket->features = features;
+
+ /* Anyone setting feature bits is implementing their own vhost
+ * device backend.
+ */
+ vsocket->use_builtin_virtio_net = false;
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_features(const char *path, uint64_t *features)
+{
+ struct vhost_user_socket *vsocket;
+ uint64_t vdpa_features;
+ struct rte_vdpa_device *vdpa_dev;
+ int did = -1;
+ int ret = 0;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (!vsocket) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "socket file %s is not registered yet.\n", path);
+ ret = -1;
+ goto unlock_exit;
+ }
+
+ did = vsocket->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (!vdpa_dev || !vdpa_dev->ops->get_features) {
+ *features = vsocket->features;
+ goto unlock_exit;
+ }
+
+ if (vdpa_dev->ops->get_features(did, &vdpa_features) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to get vdpa features "
+ "for socket file %s.\n", path);
+ ret = -1;
+ goto unlock_exit;
+ }
+
+ *features = vsocket->features & vdpa_features;
+
+unlock_exit:
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return ret;
+}
+
+int
+rte_vhost_driver_set_protocol_features(const char *path,
+ uint64_t protocol_features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->protocol_features = protocol_features;
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_protocol_features(const char *path,
+ uint64_t *protocol_features)
+{
+ struct vhost_user_socket *vsocket;
+ uint64_t vdpa_protocol_features;
+ struct rte_vdpa_device *vdpa_dev;
+ int did = -1;
+ int ret = 0;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (!vsocket) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "socket file %s is not registered yet.\n", path);
+ ret = -1;
+ goto unlock_exit;
+ }
+
+ did = vsocket->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) {
+ *protocol_features = vsocket->protocol_features;
+ goto unlock_exit;
+ }
+
+ if (vdpa_dev->ops->get_protocol_features(did,
+ &vdpa_protocol_features) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to get vdpa protocol features "
+ "for socket file %s.\n", path);
+ ret = -1;
+ goto unlock_exit;
+ }
+
+ *protocol_features = vsocket->protocol_features
+ & vdpa_protocol_features;
+
+unlock_exit:
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return ret;
+}
+
+int
+rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
+{
+ struct vhost_user_socket *vsocket;
+ uint32_t vdpa_queue_num;
+ struct rte_vdpa_device *vdpa_dev;
+ int did = -1;
+ int ret = 0;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (!vsocket) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "socket file %s is not registered yet.\n", path);
+ ret = -1;
+ goto unlock_exit;
+ }
+
+ did = vsocket->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (!vdpa_dev || !vdpa_dev->ops->get_queue_num) {
+ *queue_num = VHOST_MAX_QUEUE_PAIRS;
+ goto unlock_exit;
+ }
+
+ if (vdpa_dev->ops->get_queue_num(did, &vdpa_queue_num) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to get vdpa queue number "
+ "for socket file %s.\n", path);
+ ret = -1;
+ goto unlock_exit;
+ }
+
+ *queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num);
+
+unlock_exit:
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return ret;
+}
+
+static void
+vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
+{
+ if (vsocket && vsocket->path) {
+ free(vsocket->path);
+ vsocket->path = NULL;
+ }
+
+ if (vsocket) {
+ free(vsocket);
+ vsocket = NULL;
+ }
+}
+
+/*
+ * Register a new vhost-user socket; here we could act as server
+ * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
+ * is set.
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+ int ret = -1;
+ struct vhost_user_socket *vsocket;
+
+ if (!path)
+ return -1;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: the number of vhost sockets reaches maximum\n");
+ goto out;
+ }
+
+ vsocket = malloc(sizeof(struct vhost_user_socket));
+ if (!vsocket)
+ goto out;
+ memset(vsocket, 0, sizeof(struct vhost_user_socket));
+ vsocket->path = strdup(path);
+ if (vsocket->path == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: failed to copy socket path string\n");
+ vhost_user_socket_mem_free(vsocket);
+ goto out;
+ }
+ TAILQ_INIT(&vsocket->conn_list);
+ ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: failed to init connection mutex\n");
+ goto out_free;
+ }
+ vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+
+ /*
+ * Set the supported features correctly for the builtin vhost-user
+ * net driver.
+ *
+ * Applications know nothing about features the builtin virtio net
+ * driver (virtio_net.c) supports, thus it's not possible for them
+ * to invoke rte_vhost_driver_set_features(). To workaround it, here
+ * we set it unconditionally. If the application want to implement
+ * another vhost-user driver (say SCSI), it should call the
+ * rte_vhost_driver_set_features(), which will overwrite following
+ * two values.
+ */
+ vsocket->use_builtin_virtio_net = true;
+ vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
+ vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES;
+ vsocket->protocol_features = VHOST_USER_PROTOCOL_FEATURES;
+
+ /*
+ * Dequeue zero copy can't assure descriptors returned in order.
+ * Also, it requires that the guest memory is populated, which is
+ * not compatible with postcopy.
+ */
+ if (vsocket->dequeue_zero_copy) {
+ vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
+ vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "Dequeue zero copy requested, disabling postcopy support\n");
+ vsocket->protocol_features &=
+ ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
+ }
+
+ if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
+ vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
+ vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
+ }
+
+ if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
+ vsocket->protocol_features &=
+ ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
+ } else {
+#ifndef RTE_LIBRTE_VHOST_POSTCOPY
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Postcopy requested but not compiled\n");
+ ret = -1;
+ goto out_mutex;
+#endif
+ }
+
+ if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+ vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+ if (vsocket->reconnect && reconn_tid == 0) {
+ if (vhost_user_reconnect_init() != 0)
+ goto out_mutex;
+ }
+ } else {
+ vsocket->is_server = true;
+ }
+ ret = create_unix_socket(vsocket);
+ if (ret < 0) {
+ goto out_mutex;
+ }
+
+ vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return ret;
+
+out_mutex:
+ if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: failed to destroy connection mutex\n");
+ }
+out_free:
+ vhost_user_socket_mem_free(vsocket);
+out:
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return ret;
+}
+
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+ int found = false;
+ struct vhost_user_reconnect *reconn, *next;
+
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ if (reconn->vsocket == vsocket) {
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ close(reconn->fd);
+ free(reconn);
+ found = true;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&reconn_list.mutex);
+ return found;
+}
+
+/**
+ * Unregister the specified vhost socket
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+ int i;
+ int count;
+ struct vhost_user_connection *conn, *next;
+
+ if (path == NULL)
+ return -1;
+
+again:
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+ struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+ if (!strcmp(vsocket->path, path)) {
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ for (conn = TAILQ_FIRST(&vsocket->conn_list);
+ conn != NULL;
+ conn = next) {
+ next = TAILQ_NEXT(conn, next);
+
+ /*
+ * If r/wcb is executing, release the
+ * conn_mutex lock, and try again since
+ * the r/wcb may use the conn_mutex lock.
+ */
+ if (fdset_try_del(&vhost_user.fdset,
+ conn->connfd) == -1) {
+ pthread_mutex_unlock(
+ &vsocket->conn_mutex);
+ pthread_mutex_unlock(&vhost_user.mutex);
+ goto again;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "free connfd = %d for device '%s'\n",
+ conn->connfd, path);
+ close(conn->connfd);
+ vhost_destroy_device(conn->vid);
+ TAILQ_REMOVE(&vsocket->conn_list, conn, next);
+ free(conn);
+ }
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+
+ if (vsocket->is_server) {
+ fdset_del(&vhost_user.fdset,
+ vsocket->socket_fd);
+ close(vsocket->socket_fd);
+ unlink(path);
+ } else if (vsocket->reconnect) {
+ vhost_user_remove_reconnect(vsocket);
+ }
+
+ pthread_mutex_destroy(&vsocket->conn_mutex);
+ vhost_user_socket_mem_free(vsocket);
+
+ count = --vhost_user.vsocket_cnt;
+ vhost_user.vsockets[i] = vhost_user.vsockets[count];
+ vhost_user.vsockets[count] = NULL;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return 0;
+ }
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return -1;
+}
+
+/*
+ * Register ops so that we can add/remove device to data core.
+ */
+int
+rte_vhost_driver_callback_register(const char *path,
+ struct vhost_device_ops const * const ops)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->notify_ops = ops;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+struct vhost_device_ops const *
+vhost_driver_callback_get(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? vsocket->notify_ops : NULL;
+}
+
+int
+rte_vhost_driver_start(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+ static pthread_t fdset_tid;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ if (!vsocket)
+ return -1;
+
+ if (fdset_tid == 0) {
+ /**
+ * create a pipe which will be waited by poll and notified to
+ * rebuild the wait list of poll.
+ */
+ if (fdset_pipe_init(&vhost_user.fdset) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to create pipe for vhost fdset\n");
+ return -1;
+ }
+
+ int ret = rte_ctrl_thread_create(&fdset_tid,
+ "vhost-events", NULL, fdset_event_dispatch,
+ &vhost_user.fdset);
+ if (ret != 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to create fdset handling thread");
+
+ fdset_pipe_uninit(&vhost_user.fdset);
+ return -1;
+ }
+ }
+
+ if (vsocket->is_server)
+ return vhost_user_start_server(vsocket);
+ else
+ return vhost_user_start_client(vsocket);
+}
diff --git a/src/seastar/dpdk/lib/librte_vhost/vdpa.c b/src/seastar/dpdk/lib/librte_vhost/vdpa.c
new file mode 100644
index 000000000..e91548843
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/vdpa.c
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+/**
+ * @file
+ *
+ * Device specific vhost lib
+ */
+
+#include <stdbool.h>
+
+#include <rte_malloc.h>
+#include "rte_vdpa.h"
+#include "vhost.h"
+
+static struct rte_vdpa_device *vdpa_devices[MAX_VHOST_DEVICE];
+static uint32_t vdpa_device_num;
+
+static bool
+is_same_vdpa_device(struct rte_vdpa_dev_addr *a,
+ struct rte_vdpa_dev_addr *b)
+{
+ bool ret = true;
+
+ if (a->type != b->type)
+ return false;
+
+ switch (a->type) {
+ case PCI_ADDR:
+ if (a->pci_addr.domain != b->pci_addr.domain ||
+ a->pci_addr.bus != b->pci_addr.bus ||
+ a->pci_addr.devid != b->pci_addr.devid ||
+ a->pci_addr.function != b->pci_addr.function)
+ ret = false;
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+int
+rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
+ struct rte_vdpa_dev_ops *ops)
+{
+ struct rte_vdpa_device *dev;
+ char device_name[MAX_VDPA_NAME_LEN];
+ int i;
+
+ if (vdpa_device_num >= MAX_VHOST_DEVICE || addr == NULL || ops == NULL)
+ return -1;
+
+ for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+ dev = vdpa_devices[i];
+ if (dev && is_same_vdpa_device(&dev->addr, addr))
+ return -1;
+ }
+
+ for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+ if (vdpa_devices[i] == NULL)
+ break;
+ }
+
+ if (i == MAX_VHOST_DEVICE)
+ return -1;
+
+ snprintf(device_name, sizeof(device_name), "vdpa-dev-%d", i);
+ dev = rte_zmalloc(device_name, sizeof(struct rte_vdpa_device),
+ RTE_CACHE_LINE_SIZE);
+ if (!dev)
+ return -1;
+
+ memcpy(&dev->addr, addr, sizeof(struct rte_vdpa_dev_addr));
+ dev->ops = ops;
+ vdpa_devices[i] = dev;
+ vdpa_device_num++;
+
+ return i;
+}
+
+int
+rte_vdpa_unregister_device(int did)
+{
+ if (did < 0 || did >= MAX_VHOST_DEVICE || vdpa_devices[did] == NULL)
+ return -1;
+
+ rte_free(vdpa_devices[did]);
+ vdpa_devices[did] = NULL;
+ vdpa_device_num--;
+
+ return did;
+}
+
+int
+rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr)
+{
+ struct rte_vdpa_device *dev;
+ int i;
+
+ if (addr == NULL)
+ return -1;
+
+ for (i = 0; i < MAX_VHOST_DEVICE; ++i) {
+ dev = vdpa_devices[i];
+ if (dev && is_same_vdpa_device(&dev->addr, addr))
+ return i;
+ }
+
+ return -1;
+}
+
+struct rte_vdpa_device *
+rte_vdpa_get_device(int did)
+{
+ if (did < 0 || did >= MAX_VHOST_DEVICE)
+ return NULL;
+
+ return vdpa_devices[did];
+}
+
+int
+rte_vdpa_get_device_num(void)
+{
+ return vdpa_device_num;
+}
+
+int __rte_experimental
+rte_vdpa_relay_vring_used(int vid, uint16_t qid, void *vring_m)
+{
+ struct virtio_net *dev = get_device(vid);
+ uint16_t idx, idx_m, desc_id;
+ struct vhost_virtqueue *vq;
+ struct vring_desc desc;
+ struct vring_desc *desc_ring;
+ struct vring_desc *idesc = NULL;
+ struct vring *s_vring;
+ uint64_t dlen;
+ uint32_t nr_descs;
+ int ret;
+
+ if (!dev || !vring_m)
+ return -1;
+
+ if (qid >= dev->nr_vring)
+ return -1;
+
+ if (vq_is_packed(dev))
+ return -1;
+
+ s_vring = (struct vring *)vring_m;
+ vq = dev->virtqueue[qid];
+ idx = vq->used->idx;
+ idx_m = s_vring->used->idx;
+ ret = (uint16_t)(idx_m - idx);
+
+ while (idx != idx_m) {
+ /* copy used entry, used ring logging is not covered here */
+ vq->used->ring[idx & (vq->size - 1)] =
+ s_vring->used->ring[idx & (vq->size - 1)];
+
+ desc_id = vq->used->ring[idx & (vq->size - 1)].id;
+ desc_ring = vq->desc;
+ nr_descs = vq->size;
+
+ if (unlikely(desc_id >= vq->size))
+ return -1;
+
+ if (vq->desc[desc_id].flags & VRING_DESC_F_INDIRECT) {
+ dlen = vq->desc[desc_id].len;
+ nr_descs = dlen / sizeof(struct vring_desc);
+ if (unlikely(nr_descs > vq->size))
+ return -1;
+
+ desc_ring = (struct vring_desc *)(uintptr_t)
+ vhost_iova_to_vva(dev, vq,
+ vq->desc[desc_id].addr, &dlen,
+ VHOST_ACCESS_RO);
+ if (unlikely(!desc_ring))
+ return -1;
+
+ if (unlikely(dlen < vq->desc[desc_id].len)) {
+ idesc = alloc_copy_ind_table(dev, vq,
+ vq->desc[desc_id].addr,
+ vq->desc[desc_id].len);
+ if (unlikely(!idesc))
+ return -1;
+
+ desc_ring = idesc;
+ }
+
+ desc_id = 0;
+ }
+
+ /* dirty page logging for DMA writeable buffer */
+ do {
+ if (unlikely(desc_id >= vq->size))
+ goto fail;
+ if (unlikely(nr_descs-- == 0))
+ goto fail;
+ desc = desc_ring[desc_id];
+ if (desc.flags & VRING_DESC_F_WRITE)
+ vhost_log_write(dev, desc.addr, desc.len);
+ desc_id = desc.next;
+ } while (desc.flags & VRING_DESC_F_NEXT);
+
+ if (unlikely(idesc)) {
+ free_ind_table(idesc);
+ idesc = NULL;
+ }
+
+ idx++;
+ }
+
+ rte_smp_wmb();
+ vq->used->idx = idx_m;
+
+ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
+ vring_used_event(s_vring) = idx_m;
+
+ return ret;
+
+fail:
+ if (unlikely(idesc))
+ free_ind_table(idesc);
+ return -1;
+}
diff --git a/src/seastar/dpdk/lib/librte_vhost/vhost.c b/src/seastar/dpdk/lib/librte_vhost/vhost.c
new file mode 100644
index 000000000..163f4595e
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/vhost.c
@@ -0,0 +1,814 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2017 Intel Corporation
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_vhost.h>
+#include <rte_rwlock.h>
+
+#include "iotlb.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* Called with iotlb_lock read-locked */
+uint64_t
+__vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t iova, uint64_t *size, uint8_t perm)
+{
+ uint64_t vva, tmp_size;
+
+ if (unlikely(!*size))
+ return 0;
+
+ tmp_size = *size;
+
+ vva = vhost_user_iotlb_cache_find(vq, iova, &tmp_size, perm);
+ if (tmp_size == *size)
+ return vva;
+
+ iova += tmp_size;
+
+ if (!vhost_user_iotlb_pending_miss(vq, iova, perm)) {
+ /*
+ * iotlb_lock is read-locked for a full burst,
+ * but it only protects the iotlb cache.
+ * In case of IOTLB miss, we might block on the socket,
+ * which could cause a deadlock with QEMU if an IOTLB update
+ * is being handled. We can safely unlock here to avoid it.
+ */
+ vhost_user_iotlb_rd_unlock(vq);
+
+ vhost_user_iotlb_pending_insert(vq, iova, perm);
+ if (vhost_user_iotlb_miss(dev, iova, perm)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "IOTLB miss req failed for IOVA 0x%" PRIx64 "\n",
+ iova);
+ vhost_user_iotlb_pending_remove(vq, iova, 1, perm);
+ }
+
+ vhost_user_iotlb_rd_lock(vq);
+ }
+
+ return 0;
+}
+
+void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+ if ((vq->callfd >= 0) && (destroy != 0))
+ close(vq->callfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+}
+
+/*
+ * Unmap any memory, close any file descriptors and
+ * free any memory owned by a device.
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+ uint32_t i;
+
+ vhost_backend_cleanup(dev);
+
+ for (i = 0; i < dev->nr_vring; i++)
+ cleanup_vq(dev->virtqueue[i], destroy);
+}
+
+void
+free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ if (vq_is_packed(dev))
+ rte_free(vq->shadow_used_packed);
+ else
+ rte_free(vq->shadow_used_split);
+ rte_free(vq->batch_copy_elems);
+ rte_mempool_free(vq->iotlb_pool);
+ rte_free(vq);
+}
+
+/*
+ * Release virtqueues and device memory.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ for (i = 0; i < dev->nr_vring; i++)
+ free_vq(dev, dev->virtqueue[i]);
+
+ rte_free(dev);
+}
+
+static int
+vring_translate_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ uint64_t req_size, size;
+
+ req_size = sizeof(struct vring_desc) * vq->size;
+ size = req_size;
+ vq->desc = (struct vring_desc *)(uintptr_t)vhost_iova_to_vva(dev, vq,
+ vq->ring_addrs.desc_user_addr,
+ &size, VHOST_ACCESS_RW);
+ if (!vq->desc || size != req_size)
+ return -1;
+
+ req_size = sizeof(struct vring_avail);
+ req_size += sizeof(uint16_t) * vq->size;
+ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
+ req_size += sizeof(uint16_t);
+ size = req_size;
+ vq->avail = (struct vring_avail *)(uintptr_t)vhost_iova_to_vva(dev, vq,
+ vq->ring_addrs.avail_user_addr,
+ &size, VHOST_ACCESS_RW);
+ if (!vq->avail || size != req_size)
+ return -1;
+
+ req_size = sizeof(struct vring_used);
+ req_size += sizeof(struct vring_used_elem) * vq->size;
+ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
+ req_size += sizeof(uint16_t);
+ size = req_size;
+ vq->used = (struct vring_used *)(uintptr_t)vhost_iova_to_vva(dev, vq,
+ vq->ring_addrs.used_user_addr,
+ &size, VHOST_ACCESS_RW);
+ if (!vq->used || size != req_size)
+ return -1;
+
+ return 0;
+}
+
+static int
+vring_translate_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ uint64_t req_size, size;
+
+ req_size = sizeof(struct vring_packed_desc) * vq->size;
+ size = req_size;
+ vq->desc_packed = (struct vring_packed_desc *)(uintptr_t)
+ vhost_iova_to_vva(dev, vq, vq->ring_addrs.desc_user_addr,
+ &size, VHOST_ACCESS_RW);
+ if (!vq->desc_packed || size != req_size)
+ return -1;
+
+ req_size = sizeof(struct vring_packed_desc_event);
+ size = req_size;
+ vq->driver_event = (struct vring_packed_desc_event *)(uintptr_t)
+ vhost_iova_to_vva(dev, vq, vq->ring_addrs.avail_user_addr,
+ &size, VHOST_ACCESS_RW);
+ if (!vq->driver_event || size != req_size)
+ return -1;
+
+ req_size = sizeof(struct vring_packed_desc_event);
+ size = req_size;
+ vq->device_event = (struct vring_packed_desc_event *)(uintptr_t)
+ vhost_iova_to_vva(dev, vq, vq->ring_addrs.used_user_addr,
+ &size, VHOST_ACCESS_RW);
+ if (!vq->device_event || size != req_size)
+ return -1;
+
+ return 0;
+}
+
+int
+vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+
+ if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
+ goto out;
+
+ if (vq_is_packed(dev)) {
+ if (vring_translate_packed(dev, vq) < 0)
+ return -1;
+ } else {
+ if (vring_translate_split(dev, vq) < 0)
+ return -1;
+ }
+out:
+ vq->access_ok = 1;
+
+ return 0;
+}
+
+void
+vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_wr_lock(vq);
+
+ vq->access_ok = 0;
+ vq->desc = NULL;
+ vq->avail = NULL;
+ vq->used = NULL;
+
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_wr_unlock(vq);
+}
+
+static void
+init_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
+{
+ struct vhost_virtqueue *vq;
+
+ if (vring_idx >= VHOST_MAX_VRING) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed not init vring, out of bound (%d)\n",
+ vring_idx);
+ return;
+ }
+
+ vq = dev->virtqueue[vring_idx];
+
+ memset(vq, 0, sizeof(struct vhost_virtqueue));
+
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ vhost_user_iotlb_init(dev, vring_idx);
+ /* Backends are set to -1 indicating an inactive device. */
+ vq->backend = -1;
+
+ TAILQ_INIT(&vq->zmbuf_list);
+}
+
+static void
+reset_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
+{
+ struct vhost_virtqueue *vq;
+ int callfd;
+
+ if (vring_idx >= VHOST_MAX_VRING) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed not init vring, out of bound (%d)\n",
+ vring_idx);
+ return;
+ }
+
+ vq = dev->virtqueue[vring_idx];
+ callfd = vq->callfd;
+ init_vring_queue(dev, vring_idx);
+ vq->callfd = callfd;
+}
+
+int
+alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
+{
+ struct vhost_virtqueue *vq;
+
+ vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0);
+ if (vq == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for vring:%u.\n", vring_idx);
+ return -1;
+ }
+
+ dev->virtqueue[vring_idx] = vq;
+ init_vring_queue(dev, vring_idx);
+ rte_spinlock_init(&vq->access_lock);
+ vq->avail_wrap_counter = 1;
+ vq->used_wrap_counter = 1;
+ vq->signalled_used_valid = false;
+
+ dev->nr_vring += 1;
+
+ return 0;
+}
+
+/*
+ * Reset some variables in device structure, while keeping few
+ * others untouched, such as vid, ifname, nr_vring: they
+ * should be same unless the device is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ dev->features = 0;
+ dev->protocol_features = 0;
+ dev->flags &= VIRTIO_DEV_BUILTIN_VIRTIO_NET;
+
+ for (i = 0; i < dev->nr_vring; i++)
+ reset_vring_queue(dev, i);
+}
+
+/*
+ * Invoked when there is a new vhost-user connection established (when
+ * there is a new virtio device being attached).
+ */
+int
+vhost_new_device(void)
+{
+ struct virtio_net *dev;
+ int i;
+
+ for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+ if (vhost_devices[i] == NULL)
+ break;
+ }
+
+ if (i == MAX_VHOST_DEVICE) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to find a free slot for new device.\n");
+ return -1;
+ }
+
+ dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+ if (dev == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for new dev.\n");
+ return -1;
+ }
+
+ vhost_devices[i] = dev;
+ dev->vid = i;
+ dev->flags = VIRTIO_DEV_BUILTIN_VIRTIO_NET;
+ dev->slave_req_fd = -1;
+ dev->vdpa_dev_id = -1;
+ dev->postcopy_ufd = -1;
+ rte_spinlock_init(&dev->slave_req_lock);
+
+ return i;
+}
+
+void
+vhost_destroy_device_notify(struct virtio_net *dev)
+{
+ struct rte_vdpa_device *vdpa_dev;
+ int did;
+
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ did = dev->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (vdpa_dev && vdpa_dev->ops->dev_close)
+ vdpa_dev->ops->dev_close(dev->vid);
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+}
+
+/*
+ * Invoked when there is the vhost-user connection is broken (when
+ * the virtio device is being detached).
+ */
+void
+vhost_destroy_device(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ vhost_destroy_device_notify(dev);
+
+ cleanup_device(dev, 1);
+ free_device(dev);
+
+ vhost_devices[vid] = NULL;
+}
+
+void
+vhost_attach_vdpa_device(int vid, int did)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ if (rte_vdpa_get_device(did) == NULL)
+ return;
+
+ dev->vdpa_dev_id = did;
+}
+
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+ struct virtio_net *dev;
+ unsigned int len;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return;
+
+ len = if_len > sizeof(dev->ifname) ?
+ sizeof(dev->ifname) : if_len;
+
+ strncpy(dev->ifname, if_name, len);
+ dev->ifname[sizeof(dev->ifname) - 1] = '\0';
+}
+
+void
+vhost_enable_dequeue_zero_copy(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ dev->dequeue_zero_copy = 1;
+}
+
+void
+vhost_set_builtin_virtio_net(int vid, bool enable)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ if (enable)
+ dev->flags |= VIRTIO_DEV_BUILTIN_VIRTIO_NET;
+ else
+ dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET;
+}
+
+int
+rte_vhost_get_mtu(int vid, uint16_t *mtu)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL || mtu == NULL)
+ return -ENODEV;
+
+ if (!(dev->flags & VIRTIO_DEV_READY))
+ return -EAGAIN;
+
+ if (!(dev->features & (1ULL << VIRTIO_NET_F_MTU)))
+ return -ENOTSUP;
+
+ *mtu = dev->mtu;
+
+ return 0;
+}
+
+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+ struct virtio_net *dev = get_device(vid);
+ int numa_node;
+ int ret;
+
+ if (dev == NULL || numa_available() != 0)
+ return -1;
+
+ ret = get_mempolicy(&numa_node, NULL, 0, dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to query numa node: %s\n",
+ vid, rte_strerror(errno));
+ return -1;
+ }
+
+ return numa_node;
+#else
+ RTE_SET_USED(vid);
+ return -1;
+#endif
+}
+
+uint32_t
+rte_vhost_get_queue_num(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return 0;
+
+ return dev->nr_vring / 2;
+}
+
+uint16_t
+rte_vhost_get_vring_num(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return 0;
+
+ return dev->nr_vring;
+}
+
+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL || buf == NULL)
+ return -1;
+
+ len = RTE_MIN(len, sizeof(dev->ifname));
+
+ strncpy(buf, dev->ifname, len);
+ buf[len - 1] = '\0';
+
+ return 0;
+}
+
+int
+rte_vhost_get_negotiated_features(int vid, uint64_t *features)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL || features == NULL)
+ return -1;
+
+ *features = dev->features;
+ return 0;
+}
+
+int
+rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem)
+{
+ struct virtio_net *dev;
+ struct rte_vhost_memory *m;
+ size_t size;
+
+ dev = get_device(vid);
+ if (dev == NULL || mem == NULL)
+ return -1;
+
+ size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region);
+ m = malloc(sizeof(struct rte_vhost_memory) + size);
+ if (!m)
+ return -1;
+
+ m->nregions = dev->mem->nregions;
+ memcpy(m->regions, dev->mem->regions, size);
+ *mem = m;
+
+ return 0;
+}
+
+int
+rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+ struct rte_vhost_vring *vring)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (dev == NULL || vring == NULL)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ vring->desc = vq->desc;
+ vring->avail = vq->avail;
+ vring->used = vq->used;
+ vring->log_guest_addr = vq->log_guest_addr;
+
+ vring->callfd = vq->callfd;
+ vring->kickfd = vq->kickfd;
+ vring->size = vq->size;
+
+ return 0;
+}
+
+int
+rte_vhost_vring_call(int vid, uint16_t vring_idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ if (vq_is_packed(dev))
+ vhost_vring_call_packed(dev, vq);
+ else
+ vhost_vring_call_split(dev, vq);
+
+ return 0;
+}
+
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ vq = dev->virtqueue[queue_id];
+ if (!vq->enabled)
+ return 0;
+
+ return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
+}
+
+static inline void
+vhost_enable_notify_split(struct virtio_net *dev,
+ struct vhost_virtqueue *vq, int enable)
+{
+ if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) {
+ if (enable)
+ vq->used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ else
+ vq->used->flags |= VRING_USED_F_NO_NOTIFY;
+ } else {
+ if (enable)
+ vhost_avail_event(vq) = vq->last_avail_idx;
+ }
+}
+
+static inline void
+vhost_enable_notify_packed(struct virtio_net *dev,
+ struct vhost_virtqueue *vq, int enable)
+{
+ uint16_t flags;
+
+ if (!enable) {
+ vq->device_event->flags = VRING_EVENT_F_DISABLE;
+ return;
+ }
+
+ flags = VRING_EVENT_F_ENABLE;
+ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
+ flags = VRING_EVENT_F_DESC;
+ vq->device_event->off_wrap = vq->last_avail_idx |
+ vq->avail_wrap_counter << 15;
+ }
+
+ rte_smp_wmb();
+
+ vq->device_event->flags = flags;
+}
+
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
+{
+ struct virtio_net *dev = get_device(vid);
+ struct vhost_virtqueue *vq;
+
+ if (!dev)
+ return -1;
+
+ vq = dev->virtqueue[queue_id];
+
+ if (vq_is_packed(dev))
+ vhost_enable_notify_packed(dev, vq, enable);
+ else
+ vhost_enable_notify_split(dev, vq, enable);
+
+ return 0;
+}
+
+void
+rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ vhost_log_write(dev, addr, len);
+}
+
+void
+rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+ uint64_t offset, uint64_t len)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return;
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return;
+
+ vhost_log_used_vring(dev, vq, offset, len);
+}
+
+uint32_t
+rte_vhost_rx_queue_count(int vid, uint16_t qid)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return 0;
+
+ if (unlikely(qid >= dev->nr_vring || (qid & 1) == 0)) {
+ RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, qid);
+ return 0;
+ }
+
+ vq = dev->virtqueue[qid];
+ if (vq == NULL)
+ return 0;
+
+ if (unlikely(vq->enabled == 0 || vq->avail == NULL))
+ return 0;
+
+ return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx;
+}
+
+int rte_vhost_get_vdpa_device_id(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
+ return dev->vdpa_dev_id;
+}
+
+int rte_vhost_get_log_base(int vid, uint64_t *log_base,
+ uint64_t *log_size)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL || log_base == NULL || log_size == NULL)
+ return -1;
+
+ *log_base = dev->log_base;
+ *log_size = dev->log_size;
+
+ return 0;
+}
+
+int rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+ uint16_t *last_avail_idx, uint16_t *last_used_idx)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL || last_avail_idx == NULL || last_used_idx == NULL)
+ return -1;
+
+ *last_avail_idx = dev->virtqueue[queue_id]->last_avail_idx;
+ *last_used_idx = dev->virtqueue[queue_id]->last_used_idx;
+
+ return 0;
+}
+
+int rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+ uint16_t last_avail_idx, uint16_t last_used_idx)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (!dev)
+ return -1;
+
+ dev->virtqueue[queue_id]->last_avail_idx = last_avail_idx;
+ dev->virtqueue[queue_id]->last_used_idx = last_used_idx;
+
+ return 0;
+}
+
+int rte_vhost_extern_callback_register(int vid,
+ struct rte_vhost_user_extern_ops const * const ops, void *ctx)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL || ops == NULL)
+ return -1;
+
+ dev->extern_ops = *ops;
+ dev->extern_data = ctx;
+ return 0;
+}
diff --git a/src/seastar/dpdk/lib/librte_vhost/vhost.h b/src/seastar/dpdk/lib/librte_vhost/vhost.h
new file mode 100644
index 000000000..e9138dfab
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/vhost.h
@@ -0,0 +1,782 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <sys/socket.h>
+#include <linux/if.h>
+
+#include <rte_log.h>
+#include <rte_ether.h>
+#include <rte_rwlock.h>
+#include <rte_malloc.h>
+
+#include "rte_vhost.h"
+#include "rte_vdpa.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+/* Used to indicate that the device is ready to operate */
+#define VIRTIO_DEV_READY 2
+/* Used to indicate that the built-in vhost net device backend is enabled */
+#define VIRTIO_DEV_BUILTIN_VIRTIO_NET 4
+/* Used to indicate that the device has its own data path and configured */
+#define VIRTIO_DEV_VDPA_CONFIGURED 8
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+#define VHOST_LOG_CACHE_NR 32
+
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+ uint64_t buf_iova;
+ uint64_t buf_addr;
+ uint32_t buf_len;
+ uint32_t desc_idx;
+};
+
+/*
+ * A structure to hold some fields needed in zero copy code path,
+ * mainly for associating an mbuf with the right desc_idx.
+ */
+struct zcopy_mbuf {
+ struct rte_mbuf *mbuf;
+ uint32_t desc_idx;
+ uint16_t desc_count;
+ uint16_t in_use;
+
+ TAILQ_ENTRY(zcopy_mbuf) next;
+};
+TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
+
+/*
+ * Structure contains the info for each batched memory copy.
+ */
+struct batch_copy_elem {
+ void *dst;
+ void *src;
+ uint32_t len;
+ uint64_t log_addr;
+};
+
+/*
+ * Structure that contains the info for batched dirty logging.
+ */
+struct log_cache_entry {
+ uint32_t offset;
+ unsigned long val;
+};
+
+struct vring_used_elem_packed {
+ uint16_t id;
+ uint32_t len;
+ uint32_t count;
+};
+
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+ union {
+ struct vring_desc *desc;
+ struct vring_packed_desc *desc_packed;
+ };
+ union {
+ struct vring_avail *avail;
+ struct vring_packed_desc_event *driver_event;
+ };
+ union {
+ struct vring_used *used;
+ struct vring_packed_desc_event *device_event;
+ };
+ uint32_t size;
+
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+ /* Last used index we notify to front end. */
+ uint16_t signalled_used;
+ bool signalled_used_valid;
+#define VIRTIO_INVALID_EVENTFD (-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
+
+ /* Backend value to determine if device should started/stopped */
+ int backend;
+ int enabled;
+ int access_ok;
+ rte_spinlock_t access_lock;
+
+ /* Used to notify the guest (trigger interrupt) */
+ int callfd;
+ /* Currently unused as polling mode is enabled */
+ int kickfd;
+
+ /* Physical address of used ring, for logging */
+ uint64_t log_guest_addr;
+
+ uint16_t nr_zmbuf;
+ uint16_t zmbuf_size;
+ uint16_t last_zmbuf_idx;
+ struct zcopy_mbuf *zmbufs;
+ struct zcopy_mbuf_list zmbuf_list;
+
+ union {
+ struct vring_used_elem *shadow_used_split;
+ struct vring_used_elem_packed *shadow_used_packed;
+ };
+ uint16_t shadow_used_idx;
+ struct vhost_vring_addr ring_addrs;
+
+ struct batch_copy_elem *batch_copy_elems;
+ uint16_t batch_copy_nb_elems;
+ bool used_wrap_counter;
+ bool avail_wrap_counter;
+
+ struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
+ uint16_t log_cache_nb_elem;
+
+ rte_rwlock_t iotlb_lock;
+ rte_rwlock_t iotlb_pending_lock;
+ struct rte_mempool *iotlb_pool;
+ TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
+ int iotlb_cache_nr;
+ TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macros defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+#ifndef VIRTIO_NET_F_MQ
+ #define VIRTIO_NET_F_MQ 22
+#endif
+
+#define VHOST_MAX_VRING 0x100
+#define VHOST_MAX_QUEUE_PAIRS 0x80
+
+#ifndef VIRTIO_NET_F_MTU
+ #define VIRTIO_NET_F_MTU 3
+#endif
+
+#ifndef VIRTIO_F_ANY_LAYOUT
+ #define VIRTIO_F_ANY_LAYOUT 27
+#endif
+
+/* Declare IOMMU related bits for older kernels */
+#ifndef VIRTIO_F_IOMMU_PLATFORM
+
+#define VIRTIO_F_IOMMU_PLATFORM 33
+
+struct vhost_iotlb_msg {
+ __u64 iova;
+ __u64 size;
+ __u64 uaddr;
+#define VHOST_ACCESS_RO 0x1
+#define VHOST_ACCESS_WO 0x2
+#define VHOST_ACCESS_RW 0x3
+ __u8 perm;
+#define VHOST_IOTLB_MISS 1
+#define VHOST_IOTLB_UPDATE 2
+#define VHOST_IOTLB_INVALIDATE 3
+#define VHOST_IOTLB_ACCESS_FAIL 4
+ __u8 type;
+};
+
+#define VHOST_IOTLB_MSG 0x1
+
+struct vhost_msg {
+ int type;
+ union {
+ struct vhost_iotlb_msg iotlb;
+ __u8 padding[64];
+ };
+};
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+/* Declare packed ring related bits for older kernels */
+#ifndef VIRTIO_F_RING_PACKED
+
+#define VIRTIO_F_RING_PACKED 34
+
+struct vring_packed_desc {
+ uint64_t addr;
+ uint32_t len;
+ uint16_t id;
+ uint16_t flags;
+};
+
+struct vring_packed_desc_event {
+ uint16_t off_wrap;
+ uint16_t flags;
+};
+#endif
+
+/*
+ * Declare below packed ring defines unconditionally
+ * as Kernel header might use different names.
+ */
+#define VRING_DESC_F_AVAIL (1ULL << 7)
+#define VRING_DESC_F_USED (1ULL << 15)
+
+#define VRING_EVENT_F_ENABLE 0x0
+#define VRING_EVENT_F_DISABLE 0x1
+#define VRING_EVENT_F_DESC 0x2
+
+/*
+ * Available and used descs are in same order
+ */
+#ifndef VIRTIO_F_IN_ORDER
+#define VIRTIO_F_IN_ORDER 35
+#endif
+
+/* Features supported by this builtin vhost-user net driver. */
+#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+ (1ULL << VIRTIO_F_ANY_LAYOUT) | \
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+ (1ULL << VIRTIO_NET_F_CTRL_RX) | \
+ (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+ (1ULL << VIRTIO_NET_F_MQ) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_NET_F_GSO) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+ (1ULL << VIRTIO_NET_F_HOST_UFO) | \
+ (1ULL << VIRTIO_NET_F_HOST_ECN) | \
+ (1ULL << VIRTIO_NET_F_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+ (1ULL << VIRTIO_NET_F_GUEST_UFO) | \
+ (1ULL << VIRTIO_NET_F_GUEST_ECN) | \
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1ULL << VIRTIO_RING_F_EVENT_IDX) | \
+ (1ULL << VIRTIO_NET_F_MTU) | \
+ (1ULL << VIRTIO_F_IN_ORDER) | \
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) | \
+ (1ULL << VIRTIO_F_RING_PACKED))
+
+
+struct guest_page {
+ uint64_t guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+};
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */
+struct virtio_net {
+ /* Frontend (QEMU) memory and memory region information */
+ struct rte_vhost_memory *mem;
+ uint64_t features;
+ uint64_t protocol_features;
+ int vid;
+ uint32_t flags;
+ uint16_t vhost_hlen;
+ /* to tell if we need broadcast rarp packet */
+ rte_atomic16_t broadcast_rarp;
+ uint32_t nr_vring;
+ int dequeue_zero_copy;
+ struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
+ char ifname[IF_NAME_SZ];
+ uint64_t log_size;
+ uint64_t log_base;
+ uint64_t log_addr;
+ struct ether_addr mac;
+ uint16_t mtu;
+
+ struct vhost_device_ops const *notify_ops;
+
+ uint32_t nr_guest_pages;
+ uint32_t max_guest_pages;
+ struct guest_page *guest_pages;
+
+ int slave_req_fd;
+ rte_spinlock_t slave_req_lock;
+
+ int postcopy_ufd;
+ int postcopy_listening;
+
+ /*
+ * Device id to identify a specific backend device.
+ * It's set to -1 for the default software implementation.
+ */
+ int vdpa_dev_id;
+
+ /* context data for the external message handlers */
+ void *extern_data;
+ /* pre and post vhost user message handlers for the device */
+ struct rte_vhost_user_extern_ops extern_ops;
+} __rte_cache_aligned;
+
+static __rte_always_inline bool
+vq_is_packed(struct virtio_net *dev)
+{
+ return dev->features & (1ull << VIRTIO_F_RING_PACKED);
+}
+
+static inline bool
+desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
+{
+ uint16_t flags = *((volatile uint16_t *) &desc->flags);
+
+ return wrap_counter == !!(flags & VRING_DESC_F_AVAIL) &&
+ wrap_counter != !!(flags & VRING_DESC_F_USED);
+}
+
+#define VHOST_LOG_PAGE 4096
+
+/*
+ * Atomically set a bit in memory.
+ */
+static __rte_always_inline void
+vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
+{
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
+ /*
+ * __sync_ built-ins are deprecated, but __atomic_ ones
+ * are sub-optimized in older GCC versions.
+ */
+ __sync_fetch_and_or_1(addr, (1U << nr));
+#else
+ __atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
+#endif
+}
+
+static __rte_always_inline void
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+ vhost_set_bit(page % 8, &log_base[page / 8]);
+}
+
+static __rte_always_inline void
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{
+ uint64_t page;
+
+ if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+ !dev->log_base || !len))
+ return;
+
+ if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+ return;
+
+ /* To make sure guest memory updates are committed before logging */
+ rte_smp_wmb();
+
+ page = addr / VHOST_LOG_PAGE;
+ while (page * VHOST_LOG_PAGE < addr + len) {
+ vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+ page += 1;
+ }
+}
+
+static __rte_always_inline void
+vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ unsigned long *log_base;
+ int i;
+
+ if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+ !dev->log_base))
+ return;
+
+ rte_smp_wmb();
+
+ log_base = (unsigned long *)(uintptr_t)dev->log_base;
+
+ for (i = 0; i < vq->log_cache_nb_elem; i++) {
+ struct log_cache_entry *elem = vq->log_cache + i;
+
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
+ /*
+ * '__sync' builtins are deprecated, but '__atomic' ones
+ * are sub-optimized in older GCC versions.
+ */
+ __sync_fetch_and_or(log_base + elem->offset, elem->val);
+#else
+ __atomic_fetch_or(log_base + elem->offset, elem->val,
+ __ATOMIC_RELAXED);
+#endif
+ }
+
+ rte_smp_wmb();
+
+ vq->log_cache_nb_elem = 0;
+}
+
+static __rte_always_inline void
+vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t page)
+{
+ uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
+ uint32_t offset = page / (sizeof(unsigned long) << 3);
+ int i;
+
+ for (i = 0; i < vq->log_cache_nb_elem; i++) {
+ struct log_cache_entry *elem = vq->log_cache + i;
+
+ if (elem->offset == offset) {
+ elem->val |= (1UL << bit_nr);
+ return;
+ }
+ }
+
+ if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
+ /*
+ * No more room for a new log cache entry,
+ * so write the dirty log map directly.
+ */
+ rte_smp_wmb();
+ vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+
+ return;
+ }
+
+ vq->log_cache[i].offset = offset;
+ vq->log_cache[i].val = (1UL << bit_nr);
+ vq->log_cache_nb_elem++;
+}
+
+static __rte_always_inline void
+vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t addr, uint64_t len)
+{
+ uint64_t page;
+
+ if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+ !dev->log_base || !len))
+ return;
+
+ if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+ return;
+
+ page = addr / VHOST_LOG_PAGE;
+ while (page * VHOST_LOG_PAGE < addr + len) {
+ vhost_log_cache_page(dev, vq, page);
+ page += 1;
+ }
+}
+
+static __rte_always_inline void
+vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t offset, uint64_t len)
+{
+ vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
+}
+
+static __rte_always_inline void
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t offset, uint64_t len)
+{
+ vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) \
+ RTE_LOG(DEBUG, log_type, fmt, ##args)
+#define PRINT_PACKET(device, addr, size, header) do { \
+ char *pkt_addr = (char *)(addr); \
+ unsigned int index; \
+ char packet[VHOST_MAX_PRINT_BUFF]; \
+ \
+ if ((header)) \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
+ else \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
+ for (index = 0; index < (size); index++) { \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+ "%02hhx ", pkt_addr[index]); \
+ } \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+ \
+ VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+extern uint64_t VHOST_FEATURES;
+#define MAX_VHOST_DEVICE 1024
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* Convert guest physical address to host physical address */
+static __rte_always_inline rte_iova_t
+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
+{
+ uint32_t i;
+ struct guest_page *page;
+
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ if (gpa >= page->guest_phys_addr &&
+ gpa + size < page->guest_phys_addr + page->size) {
+ return gpa - page->guest_phys_addr +
+ page->host_phys_addr;
+ }
+ }
+
+ return 0;
+}
+
+static __rte_always_inline struct virtio_net *
+get_device(int vid)
+{
+ struct virtio_net *dev = vhost_devices[vid];
+
+ if (unlikely(!dev)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) device not found.\n", vid);
+ }
+
+ return dev;
+}
+
+int vhost_new_device(void);
+void cleanup_device(struct virtio_net *dev, int destroy);
+void reset_device(struct virtio_net *dev);
+void vhost_destroy_device(int);
+void vhost_destroy_device_notify(struct virtio_net *dev);
+
+void cleanup_vq(struct vhost_virtqueue *vq, int destroy);
+void free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq);
+
+int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx);
+
+void vhost_attach_vdpa_device(int vid, int did);
+
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+void vhost_enable_dequeue_zero_copy(int vid);
+void vhost_set_builtin_virtio_net(int vid, bool enable);
+
+struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
+
+/*
+ * Backend-specific cleanup.
+ *
+ * TODO: fix it; we have one backend now
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t iova, uint64_t *len, uint8_t perm);
+int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq);
+void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq);
+
+static __rte_always_inline uint64_t
+vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t iova, uint64_t *len, uint8_t perm)
+{
+ if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
+ return rte_vhost_va_from_guest_pa(dev->mem, iova, len);
+
+ return __vhost_iova_to_vva(dev, vq, iova, len, perm);
+}
+
+#define vhost_avail_event(vr) \
+ (*(volatile uint16_t*)&(vr)->used->ring[(vr)->size])
+#define vhost_used_event(vr) \
+ (*(volatile uint16_t*)&(vr)->avail->ring[(vr)->size])
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other size, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static __rte_always_inline int
+vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+ return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+static __rte_always_inline void
+vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ /* Flush used->idx update before we read avail->flags. */
+ rte_smp_mb();
+
+ /* Don't kick guest if we don't reach index specified by guest. */
+ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
+ uint16_t old = vq->signalled_used;
+ uint16_t new = vq->last_used_idx;
+ bool signalled_used_valid = vq->signalled_used_valid;
+
+ vq->signalled_used = new;
+ vq->signalled_used_valid = true;
+
+ VHOST_LOG_DEBUG(VHOST_DATA, "%s: used_event_idx=%d, old=%d, new=%d\n",
+ __func__,
+ vhost_used_event(vq),
+ old, new);
+
+ if ((vhost_need_event(vhost_used_event(vq), new, old) &&
+ (vq->callfd >= 0)) ||
+ unlikely(!signalled_used_valid))
+ eventfd_write(vq->callfd, (eventfd_t) 1);
+ } else {
+ /* Kick the guest if necessary. */
+ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && (vq->callfd >= 0))
+ eventfd_write(vq->callfd, (eventfd_t)1);
+ }
+}
+
+static __rte_always_inline void
+vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ uint16_t old, new, off, off_wrap;
+ bool signalled_used_valid, kick = false;
+
+ /* Flush used desc update. */
+ rte_smp_mb();
+
+ if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) {
+ if (vq->driver_event->flags !=
+ VRING_EVENT_F_DISABLE)
+ kick = true;
+ goto kick;
+ }
+
+ old = vq->signalled_used;
+ new = vq->last_used_idx;
+ vq->signalled_used = new;
+ signalled_used_valid = vq->signalled_used_valid;
+ vq->signalled_used_valid = true;
+
+ if (vq->driver_event->flags != VRING_EVENT_F_DESC) {
+ if (vq->driver_event->flags != VRING_EVENT_F_DISABLE)
+ kick = true;
+ goto kick;
+ }
+
+ if (unlikely(!signalled_used_valid)) {
+ kick = true;
+ goto kick;
+ }
+
+ rte_smp_rmb();
+
+ off_wrap = vq->driver_event->off_wrap;
+ off = off_wrap & ~(1 << 15);
+
+ if (new <= old)
+ old -= vq->size;
+
+ if (vq->used_wrap_counter != off_wrap >> 15)
+ off -= vq->size;
+
+ if (vhost_need_event(off, new, old))
+ kick = true;
+kick:
+ if (kick)
+ eventfd_write(vq->callfd, (eventfd_t)1);
+}
+
+static __rte_always_inline void *
+alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t desc_addr, uint64_t desc_len)
+{
+ void *idesc;
+ uint64_t src, dst;
+ uint64_t len, remain = desc_len;
+
+ idesc = rte_malloc(__func__, desc_len, 0);
+ if (unlikely(!idesc))
+ return 0;
+
+ dst = (uint64_t)(uintptr_t)idesc;
+
+ while (remain) {
+ len = remain;
+ src = vhost_iova_to_vva(dev, vq, desc_addr, &len,
+ VHOST_ACCESS_RO);
+ if (unlikely(!src || !len)) {
+ rte_free(idesc);
+ return 0;
+ }
+
+ rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
+
+ remain -= len;
+ dst += len;
+ desc_addr += len;
+ }
+
+ return idesc;
+}
+
+static __rte_always_inline void
+free_ind_table(void *idesc)
+{
+ rte_free(idesc);
+}
+
+static __rte_always_inline void
+restore_mbuf(struct rte_mbuf *m)
+{
+ uint32_t mbuf_size, priv_size;
+
+ while (m) {
+ priv_size = rte_pktmbuf_priv_size(m->pool);
+ mbuf_size = sizeof(struct rte_mbuf) + priv_size;
+ /* start of buffer is after mbuf structure and priv data */
+
+ m->buf_addr = (char *)m + mbuf_size;
+ m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
+ m = m->next;
+ }
+}
+
+static __rte_always_inline bool
+mbuf_is_consumed(struct rte_mbuf *m)
+{
+ while (m) {
+ if (rte_mbuf_refcnt_read(m) > 1)
+ return false;
+ m = m->next;
+ }
+
+ return true;
+}
+
+static __rte_always_inline void
+put_zmbuf(struct zcopy_mbuf *zmbuf)
+{
+ zmbuf->in_use = 0;
+}
+
+#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/src/seastar/dpdk/lib/librte_vhost/vhost_crypto.c b/src/seastar/dpdk/lib/librte_vhost/vhost_crypto.c
new file mode 100644
index 000000000..0edf12d52
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/vhost_crypto.c
@@ -0,0 +1,1677 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+#include <rte_malloc.h>
+#include <rte_hash.h>
+#include <rte_jhash.h>
+#include <rte_mbuf.h>
+#include <rte_cryptodev.h>
+
+#include "rte_vhost_crypto.h"
+#include "vhost.h"
+#include "vhost_user.h"
+#include "virtio_crypto.h"
+
+#define INHDR_LEN (sizeof(struct virtio_crypto_inhdr))
+#define IV_OFFSET (sizeof(struct rte_crypto_op) + \
+ sizeof(struct rte_crypto_sym_op))
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VC_LOG_ERR(fmt, args...) \
+ RTE_LOG(ERR, USER1, "[%s] %s() line %u: " fmt "\n", \
+ "Vhost-Crypto", __func__, __LINE__, ## args)
+#define VC_LOG_INFO(fmt, args...) \
+ RTE_LOG(INFO, USER1, "[%s] %s() line %u: " fmt "\n", \
+ "Vhost-Crypto", __func__, __LINE__, ## args)
+
+#define VC_LOG_DBG(fmt, args...) \
+ RTE_LOG(DEBUG, USER1, "[%s] %s() line %u: " fmt "\n", \
+ "Vhost-Crypto", __func__, __LINE__, ## args)
+#else
+#define VC_LOG_ERR(fmt, args...) \
+ RTE_LOG(ERR, USER1, "[VHOST-Crypto]: " fmt "\n", ## args)
+#define VC_LOG_INFO(fmt, args...) \
+ RTE_LOG(INFO, USER1, "[VHOST-Crypto]: " fmt "\n", ## args)
+#define VC_LOG_DBG(fmt, args...)
+#endif
+
+#define VIRTIO_CRYPTO_FEATURES ((1 << VIRTIO_F_NOTIFY_ON_EMPTY) | \
+ (1 << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1 << VIRTIO_RING_F_EVENT_IDX) | \
+ (1 << VIRTIO_CRYPTO_SERVICE_CIPHER) | \
+ (1 << VIRTIO_CRYPTO_SERVICE_MAC) | \
+ (1 << VIRTIO_NET_F_CTRL_VQ))
+
+#define IOVA_TO_VVA(t, r, a, l, p) \
+ ((t)(uintptr_t)vhost_iova_to_vva(r->dev, r->vq, a, l, p))
+
+static int
+cipher_algo_transform(uint32_t virtio_cipher_algo)
+{
+ int ret;
+
+ switch (virtio_cipher_algo) {
+ case VIRTIO_CRYPTO_CIPHER_AES_CBC:
+ ret = RTE_CRYPTO_CIPHER_AES_CBC;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_AES_CTR:
+ ret = RTE_CRYPTO_CIPHER_AES_CTR;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_DES_ECB:
+ ret = -VIRTIO_CRYPTO_NOTSUPP;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_DES_CBC:
+ ret = RTE_CRYPTO_CIPHER_DES_CBC;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_3DES_ECB:
+ ret = RTE_CRYPTO_CIPHER_3DES_ECB;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_3DES_CBC:
+ ret = RTE_CRYPTO_CIPHER_3DES_CBC;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_3DES_CTR:
+ ret = RTE_CRYPTO_CIPHER_3DES_CTR;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_KASUMI_F8:
+ ret = RTE_CRYPTO_CIPHER_KASUMI_F8;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_SNOW3G_UEA2:
+ ret = RTE_CRYPTO_CIPHER_SNOW3G_UEA2;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_AES_F8:
+ ret = RTE_CRYPTO_CIPHER_AES_F8;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_AES_XTS:
+ ret = RTE_CRYPTO_CIPHER_AES_XTS;
+ break;
+ case VIRTIO_CRYPTO_CIPHER_ZUC_EEA3:
+ ret = RTE_CRYPTO_CIPHER_ZUC_EEA3;
+ break;
+ default:
+ ret = -VIRTIO_CRYPTO_BADMSG;
+ break;
+ }
+
+ return ret;
+}
+
+static int
+auth_algo_transform(uint32_t virtio_auth_algo)
+{
+ int ret;
+
+ switch (virtio_auth_algo) {
+
+ case VIRTIO_CRYPTO_NO_MAC:
+ ret = RTE_CRYPTO_AUTH_NULL;
+ break;
+ case VIRTIO_CRYPTO_MAC_HMAC_MD5:
+ ret = RTE_CRYPTO_AUTH_MD5_HMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_HMAC_SHA1:
+ ret = RTE_CRYPTO_AUTH_SHA1_HMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_HMAC_SHA_224:
+ ret = RTE_CRYPTO_AUTH_SHA224_HMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_HMAC_SHA_256:
+ ret = RTE_CRYPTO_AUTH_SHA256_HMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_HMAC_SHA_384:
+ ret = RTE_CRYPTO_AUTH_SHA384_HMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_HMAC_SHA_512:
+ ret = RTE_CRYPTO_AUTH_SHA512_HMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_CMAC_3DES:
+ ret = -VIRTIO_CRYPTO_NOTSUPP;
+ break;
+ case VIRTIO_CRYPTO_MAC_CMAC_AES:
+ ret = RTE_CRYPTO_AUTH_AES_CMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_KASUMI_F9:
+ ret = RTE_CRYPTO_AUTH_KASUMI_F9;
+ break;
+ case VIRTIO_CRYPTO_MAC_SNOW3G_UIA2:
+ ret = RTE_CRYPTO_AUTH_SNOW3G_UIA2;
+ break;
+ case VIRTIO_CRYPTO_MAC_GMAC_AES:
+ ret = RTE_CRYPTO_AUTH_AES_GMAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_GMAC_TWOFISH:
+ ret = -VIRTIO_CRYPTO_NOTSUPP;
+ break;
+ case VIRTIO_CRYPTO_MAC_CBCMAC_AES:
+ ret = RTE_CRYPTO_AUTH_AES_CBC_MAC;
+ break;
+ case VIRTIO_CRYPTO_MAC_CBCMAC_KASUMI_F9:
+ ret = -VIRTIO_CRYPTO_NOTSUPP;
+ break;
+ case VIRTIO_CRYPTO_MAC_XCBC_AES:
+ ret = RTE_CRYPTO_AUTH_AES_XCBC_MAC;
+ break;
+ default:
+ ret = -VIRTIO_CRYPTO_BADMSG;
+ break;
+ }
+
+ return ret;
+}
+
+static int get_iv_len(enum rte_crypto_cipher_algorithm algo)
+{
+ int len;
+
+ switch (algo) {
+ case RTE_CRYPTO_CIPHER_3DES_CBC:
+ len = 8;
+ break;
+ case RTE_CRYPTO_CIPHER_3DES_CTR:
+ len = 8;
+ break;
+ case RTE_CRYPTO_CIPHER_3DES_ECB:
+ len = 8;
+ break;
+ case RTE_CRYPTO_CIPHER_AES_CBC:
+ len = 16;
+ break;
+
+ /* TODO: add common algos */
+
+ default:
+ len = -1;
+ break;
+ }
+
+ return len;
+}
+
+/**
+ * vhost_crypto struct is used to maintain a number of virtio_cryptos and
+ * one DPDK crypto device that deals with all crypto workloads. It is declared
+ * here and defined in vhost_crypto.c
+ */
+struct vhost_crypto {
+ /** Used to lookup DPDK Cryptodev Session based on VIRTIO crypto
+ * session ID.
+ */
+ struct rte_hash *session_map;
+ struct rte_mempool *mbuf_pool;
+ struct rte_mempool *sess_pool;
+ struct rte_mempool *sess_priv_pool;
+ struct rte_mempool *wb_pool;
+
+ /** DPDK cryptodev ID */
+ uint8_t cid;
+ uint16_t nb_qps;
+
+ uint64_t last_session_id;
+
+ uint64_t cache_session_id;
+ struct rte_cryptodev_sym_session *cache_session;
+ /** socket id for the device */
+ int socket_id;
+
+ struct virtio_net *dev;
+
+ uint8_t option;
+} __rte_cache_aligned;
+
+struct vhost_crypto_writeback_data {
+ uint8_t *src;
+ uint8_t *dst;
+ uint64_t len;
+ struct vhost_crypto_writeback_data *next;
+};
+
+struct vhost_crypto_data_req {
+ struct vring_desc *head;
+ struct virtio_net *dev;
+ struct virtio_crypto_inhdr *inhdr;
+ struct vhost_virtqueue *vq;
+ struct vhost_crypto_writeback_data *wb;
+ struct rte_mempool *wb_pool;
+ uint16_t desc_idx;
+ uint16_t len;
+ uint16_t zero_copy;
+};
+
+static int
+transform_cipher_param(struct rte_crypto_sym_xform *xform,
+ VhostUserCryptoSessionParam *param)
+{
+ int ret;
+
+ ret = cipher_algo_transform(param->cipher_algo);
+ if (unlikely(ret < 0))
+ return ret;
+
+ xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+ xform->cipher.algo = (enum rte_crypto_cipher_algorithm)ret;
+ xform->cipher.key.length = param->cipher_key_len;
+ if (xform->cipher.key.length > 0)
+ xform->cipher.key.data = param->cipher_key_buf;
+ if (param->dir == VIRTIO_CRYPTO_OP_ENCRYPT)
+ xform->cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT;
+ else if (param->dir == VIRTIO_CRYPTO_OP_DECRYPT)
+ xform->cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT;
+ else {
+ VC_LOG_DBG("Bad operation type");
+ return -VIRTIO_CRYPTO_BADMSG;
+ }
+
+ ret = get_iv_len(xform->cipher.algo);
+ if (unlikely(ret < 0))
+ return ret;
+ xform->cipher.iv.length = (uint16_t)ret;
+ xform->cipher.iv.offset = IV_OFFSET;
+ return 0;
+}
+
+static int
+transform_chain_param(struct rte_crypto_sym_xform *xforms,
+ VhostUserCryptoSessionParam *param)
+{
+ struct rte_crypto_sym_xform *xform_cipher, *xform_auth;
+ int ret;
+
+ switch (param->chaining_dir) {
+ case VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER:
+ xform_auth = xforms;
+ xform_cipher = xforms->next;
+ xform_cipher->cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT;
+ xform_auth->auth.op = RTE_CRYPTO_AUTH_OP_VERIFY;
+ break;
+ case VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH:
+ xform_cipher = xforms;
+ xform_auth = xforms->next;
+ xform_cipher->cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT;
+ xform_auth->auth.op = RTE_CRYPTO_AUTH_OP_GENERATE;
+ break;
+ default:
+ return -VIRTIO_CRYPTO_BADMSG;
+ }
+
+ /* cipher */
+ ret = cipher_algo_transform(param->cipher_algo);
+ if (unlikely(ret < 0))
+ return ret;
+ xform_cipher->type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+ xform_cipher->cipher.algo = (enum rte_crypto_cipher_algorithm)ret;
+ xform_cipher->cipher.key.length = param->cipher_key_len;
+ xform_cipher->cipher.key.data = param->cipher_key_buf;
+ ret = get_iv_len(xform_cipher->cipher.algo);
+ if (unlikely(ret < 0))
+ return ret;
+ xform_cipher->cipher.iv.length = (uint16_t)ret;
+ xform_cipher->cipher.iv.offset = IV_OFFSET;
+
+ /* auth */
+ xform_auth->type = RTE_CRYPTO_SYM_XFORM_AUTH;
+ ret = auth_algo_transform(param->hash_algo);
+ if (unlikely(ret < 0))
+ return ret;
+ xform_auth->auth.algo = (enum rte_crypto_auth_algorithm)ret;
+ xform_auth->auth.digest_length = param->digest_len;
+ xform_auth->auth.key.length = param->auth_key_len;
+ xform_auth->auth.key.data = param->auth_key_buf;
+
+ return 0;
+}
+
+static void
+vhost_crypto_create_sess(struct vhost_crypto *vcrypto,
+ VhostUserCryptoSessionParam *sess_param)
+{
+ struct rte_crypto_sym_xform xform1 = {0}, xform2 = {0};
+ struct rte_cryptodev_sym_session *session;
+ int ret;
+
+ switch (sess_param->op_type) {
+ case VIRTIO_CRYPTO_SYM_OP_NONE:
+ case VIRTIO_CRYPTO_SYM_OP_CIPHER:
+ ret = transform_cipher_param(&xform1, sess_param);
+ if (unlikely(ret)) {
+ VC_LOG_ERR("Error transform session msg (%i)", ret);
+ sess_param->session_id = ret;
+ return;
+ }
+ break;
+ case VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING:
+ if (unlikely(sess_param->hash_mode !=
+ VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH)) {
+ sess_param->session_id = -VIRTIO_CRYPTO_NOTSUPP;
+ VC_LOG_ERR("Error transform session message (%i)",
+ -VIRTIO_CRYPTO_NOTSUPP);
+ return;
+ }
+
+ xform1.next = &xform2;
+
+ ret = transform_chain_param(&xform1, sess_param);
+ if (unlikely(ret)) {
+ VC_LOG_ERR("Error transform session message (%i)", ret);
+ sess_param->session_id = ret;
+ return;
+ }
+
+ break;
+ default:
+ VC_LOG_ERR("Algorithm not yet supported");
+ sess_param->session_id = -VIRTIO_CRYPTO_NOTSUPP;
+ return;
+ }
+
+ session = rte_cryptodev_sym_session_create(vcrypto->sess_pool);
+ if (!session) {
+ VC_LOG_ERR("Failed to create session");
+ sess_param->session_id = -VIRTIO_CRYPTO_ERR;
+ return;
+ }
+
+ if (rte_cryptodev_sym_session_init(vcrypto->cid, session, &xform1,
+ vcrypto->sess_priv_pool) < 0) {
+ VC_LOG_ERR("Failed to initialize session");
+ sess_param->session_id = -VIRTIO_CRYPTO_ERR;
+ return;
+ }
+
+ /* insert hash to map */
+ if (rte_hash_add_key_data(vcrypto->session_map,
+ &vcrypto->last_session_id, session) < 0) {
+ VC_LOG_ERR("Failed to insert session to hash table");
+
+ if (rte_cryptodev_sym_session_clear(vcrypto->cid, session) < 0)
+ VC_LOG_ERR("Failed to clear session");
+ else {
+ if (rte_cryptodev_sym_session_free(session) < 0)
+ VC_LOG_ERR("Failed to free session");
+ }
+ sess_param->session_id = -VIRTIO_CRYPTO_ERR;
+ return;
+ }
+
+ VC_LOG_INFO("Session %"PRIu64" created for vdev %i.",
+ vcrypto->last_session_id, vcrypto->dev->vid);
+
+ sess_param->session_id = vcrypto->last_session_id;
+ vcrypto->last_session_id++;
+}
+
+static int
+vhost_crypto_close_sess(struct vhost_crypto *vcrypto, uint64_t session_id)
+{
+ struct rte_cryptodev_sym_session *session;
+ uint64_t sess_id = session_id;
+ int ret;
+
+ ret = rte_hash_lookup_data(vcrypto->session_map, &sess_id,
+ (void **)&session);
+
+ if (unlikely(ret < 0)) {
+ VC_LOG_ERR("Failed to delete session %"PRIu64".", session_id);
+ return -VIRTIO_CRYPTO_INVSESS;
+ }
+
+ if (rte_cryptodev_sym_session_clear(vcrypto->cid, session) < 0) {
+ VC_LOG_DBG("Failed to clear session");
+ return -VIRTIO_CRYPTO_ERR;
+ }
+
+ if (rte_cryptodev_sym_session_free(session) < 0) {
+ VC_LOG_DBG("Failed to free session");
+ return -VIRTIO_CRYPTO_ERR;
+ }
+
+ if (rte_hash_del_key(vcrypto->session_map, &sess_id) < 0) {
+ VC_LOG_DBG("Failed to delete session from hash table.");
+ return -VIRTIO_CRYPTO_ERR;
+ }
+
+ VC_LOG_INFO("Session %"PRIu64" deleted for vdev %i.", sess_id,
+ vcrypto->dev->vid);
+
+ return 0;
+}
+
+static enum rte_vhost_msg_result
+vhost_crypto_msg_post_handler(int vid, void *msg)
+{
+ struct virtio_net *dev = get_device(vid);
+ struct vhost_crypto *vcrypto;
+ VhostUserMsg *vmsg = msg;
+ enum rte_vhost_msg_result ret = RTE_VHOST_MSG_RESULT_OK;
+
+ if (dev == NULL) {
+ VC_LOG_ERR("Invalid vid %i", vid);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ vcrypto = dev->extern_data;
+ if (vcrypto == NULL) {
+ VC_LOG_ERR("Cannot find required data, is it initialized?");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ switch (vmsg->request.master) {
+ case VHOST_USER_CRYPTO_CREATE_SESS:
+ vhost_crypto_create_sess(vcrypto,
+ &vmsg->payload.crypto_session);
+ vmsg->fd_num = 0;
+ ret = RTE_VHOST_MSG_RESULT_REPLY;
+ break;
+ case VHOST_USER_CRYPTO_CLOSE_SESS:
+ if (vhost_crypto_close_sess(vcrypto, vmsg->payload.u64))
+ ret = RTE_VHOST_MSG_RESULT_ERR;
+ break;
+ default:
+ ret = RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+ break;
+ }
+
+ return ret;
+}
+
+static __rte_always_inline struct vring_desc *
+find_write_desc(struct vring_desc *head, struct vring_desc *desc,
+ uint32_t *nb_descs, uint32_t vq_size)
+{
+ if (desc->flags & VRING_DESC_F_WRITE)
+ return desc;
+
+ while (desc->flags & VRING_DESC_F_NEXT) {
+ if (unlikely(*nb_descs == 0 || desc->next >= vq_size))
+ return NULL;
+ (*nb_descs)--;
+
+ desc = &head[desc->next];
+ if (desc->flags & VRING_DESC_F_WRITE)
+ return desc;
+ }
+
+ return NULL;
+}
+
+static struct virtio_crypto_inhdr *
+reach_inhdr(struct vhost_crypto_data_req *vc_req, struct vring_desc *desc,
+ uint32_t *nb_descs, uint32_t vq_size)
+{
+ uint64_t dlen;
+ struct virtio_crypto_inhdr *inhdr;
+
+ while (desc->flags & VRING_DESC_F_NEXT) {
+ if (unlikely(*nb_descs == 0 || desc->next >= vq_size))
+ return NULL;
+ (*nb_descs)--;
+ desc = &vc_req->head[desc->next];
+ }
+
+ dlen = desc->len;
+ inhdr = IOVA_TO_VVA(struct virtio_crypto_inhdr *, vc_req, desc->addr,
+ &dlen, VHOST_ACCESS_WO);
+ if (unlikely(!inhdr || dlen != desc->len))
+ return NULL;
+
+ return inhdr;
+}
+
+static __rte_always_inline int
+move_desc(struct vring_desc *head, struct vring_desc **cur_desc,
+ uint32_t size, uint32_t *nb_descs, uint32_t vq_size)
+{
+ struct vring_desc *desc = *cur_desc;
+ int left = size - desc->len;
+
+ while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) {
+ (*nb_descs)--;
+ if (unlikely(*nb_descs == 0 || desc->next >= vq_size))
+ return -1;
+
+ desc = &head[desc->next];
+ rte_prefetch0(&head[desc->next]);
+ left -= desc->len;
+ }
+
+ if (unlikely(left > 0))
+ return -1;
+
+ if (unlikely(*nb_descs == 0))
+ *cur_desc = NULL;
+ else {
+ if (unlikely(desc->next >= vq_size))
+ return -1;
+ *cur_desc = &head[desc->next];
+ }
+
+ return 0;
+}
+
+static __rte_always_inline void *
+get_data_ptr(struct vhost_crypto_data_req *vc_req, struct vring_desc *cur_desc,
+ uint8_t perm)
+{
+ void *data;
+ uint64_t dlen = cur_desc->len;
+
+ data = IOVA_TO_VVA(void *, vc_req, cur_desc->addr, &dlen, perm);
+ if (unlikely(!data || dlen != cur_desc->len)) {
+ VC_LOG_ERR("Failed to map object");
+ return NULL;
+ }
+
+ return data;
+}
+
+static int
+copy_data(void *dst_data, struct vhost_crypto_data_req *vc_req,
+ struct vring_desc **cur_desc, uint32_t size,
+ uint32_t *nb_descs, uint32_t vq_size)
+{
+ struct vring_desc *desc = *cur_desc;
+ uint64_t remain, addr, dlen, len;
+ uint32_t to_copy;
+ uint8_t *data = dst_data;
+ uint8_t *src;
+ int left = size;
+
+ to_copy = RTE_MIN(desc->len, (uint32_t)left);
+ dlen = to_copy;
+ src = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen,
+ VHOST_ACCESS_RO);
+ if (unlikely(!src || !dlen))
+ return -1;
+
+ rte_memcpy((uint8_t *)data, src, dlen);
+ data += dlen;
+
+ if (unlikely(dlen < to_copy)) {
+ remain = to_copy - dlen;
+ addr = desc->addr + dlen;
+
+ while (remain) {
+ len = remain;
+ src = IOVA_TO_VVA(uint8_t *, vc_req, addr, &len,
+ VHOST_ACCESS_RO);
+ if (unlikely(!src || !len)) {
+ VC_LOG_ERR("Failed to map descriptor");
+ return -1;
+ }
+
+ rte_memcpy(data, src, len);
+ addr += len;
+ remain -= len;
+ data += len;
+ }
+ }
+
+ left -= to_copy;
+
+ while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) {
+ if (unlikely(*nb_descs == 0 || desc->next >= vq_size)) {
+ VC_LOG_ERR("Invalid descriptors");
+ return -1;
+ }
+ (*nb_descs)--;
+
+ desc = &vc_req->head[desc->next];
+ rte_prefetch0(&vc_req->head[desc->next]);
+ to_copy = RTE_MIN(desc->len, (uint32_t)left);
+ dlen = desc->len;
+ src = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen,
+ VHOST_ACCESS_RO);
+ if (unlikely(!src || !dlen)) {
+ VC_LOG_ERR("Failed to map descriptor");
+ return -1;
+ }
+
+ rte_memcpy(data, src, dlen);
+ data += dlen;
+
+ if (unlikely(dlen < to_copy)) {
+ remain = to_copy - dlen;
+ addr = desc->addr + dlen;
+
+ while (remain) {
+ len = remain;
+ src = IOVA_TO_VVA(uint8_t *, vc_req, addr, &len,
+ VHOST_ACCESS_RO);
+ if (unlikely(!src || !len)) {
+ VC_LOG_ERR("Failed to map descriptor");
+ return -1;
+ }
+
+ rte_memcpy(data, src, len);
+ addr += len;
+ remain -= len;
+ data += len;
+ }
+ }
+
+ left -= to_copy;
+ }
+
+ if (unlikely(left > 0)) {
+ VC_LOG_ERR("Incorrect virtio descriptor");
+ return -1;
+ }
+
+ if (unlikely(*nb_descs == 0))
+ *cur_desc = NULL;
+ else {
+ if (unlikely(desc->next >= vq_size))
+ return -1;
+ *cur_desc = &vc_req->head[desc->next];
+ }
+
+ return 0;
+}
+
+static void
+write_back_data(struct vhost_crypto_data_req *vc_req)
+{
+ struct vhost_crypto_writeback_data *wb_data = vc_req->wb, *wb_last;
+
+ while (wb_data) {
+ rte_memcpy(wb_data->dst, wb_data->src, wb_data->len);
+ wb_last = wb_data;
+ wb_data = wb_data->next;
+ rte_mempool_put(vc_req->wb_pool, wb_last);
+ }
+}
+
+static void
+free_wb_data(struct vhost_crypto_writeback_data *wb_data,
+ struct rte_mempool *mp)
+{
+ while (wb_data->next != NULL)
+ free_wb_data(wb_data->next, mp);
+
+ rte_mempool_put(mp, wb_data);
+}
+
+/**
+ * The function will allocate a vhost_crypto_writeback_data linked list
+ * containing the source and destination data pointers for the write back
+ * operation after dequeued from Cryptodev PMD queues.
+ *
+ * @param vc_req
+ * The vhost crypto data request pointer
+ * @param cur_desc
+ * The pointer of the current in use descriptor pointer. The content of
+ * cur_desc is expected to be updated after the function execution.
+ * @param end_wb_data
+ * The last write back data element to be returned. It is used only in cipher
+ * and hash chain operations.
+ * @param src
+ * The source data pointer
+ * @param offset
+ * The offset to both source and destination data. For source data the offset
+ * is the number of bytes between src and start point of cipher operation. For
+ * destination data the offset is the number of bytes from *cur_desc->addr
+ * to the point where the src will be written to.
+ * @param write_back_len
+ * The size of the write back length.
+ * @return
+ * The pointer to the start of the write back data linked list.
+ */
+static struct vhost_crypto_writeback_data *
+prepare_write_back_data(struct vhost_crypto_data_req *vc_req,
+ struct vring_desc **cur_desc,
+ struct vhost_crypto_writeback_data **end_wb_data,
+ uint8_t *src,
+ uint32_t offset,
+ uint64_t write_back_len,
+ uint32_t *nb_descs, uint32_t vq_size)
+{
+ struct vhost_crypto_writeback_data *wb_data, *head;
+ struct vring_desc *desc = *cur_desc;
+ uint64_t dlen;
+ uint8_t *dst;
+ int ret;
+
+ ret = rte_mempool_get(vc_req->wb_pool, (void **)&head);
+ if (unlikely(ret < 0)) {
+ VC_LOG_ERR("no memory");
+ goto error_exit;
+ }
+
+ wb_data = head;
+
+ if (likely(desc->len > offset)) {
+ wb_data->src = src + offset;
+ dlen = desc->len;
+ dst = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr,
+ &dlen, VHOST_ACCESS_RW) + offset;
+ if (unlikely(!dst || dlen != desc->len)) {
+ VC_LOG_ERR("Failed to map descriptor");
+ goto error_exit;
+ }
+
+ wb_data->dst = dst;
+ wb_data->len = desc->len - offset;
+ write_back_len -= wb_data->len;
+ src += offset + wb_data->len;
+ offset = 0;
+
+ if (unlikely(write_back_len)) {
+ ret = rte_mempool_get(vc_req->wb_pool,
+ (void **)&(wb_data->next));
+ if (unlikely(ret < 0)) {
+ VC_LOG_ERR("no memory");
+ goto error_exit;
+ }
+
+ wb_data = wb_data->next;
+ } else
+ wb_data->next = NULL;
+ } else
+ offset -= desc->len;
+
+ while (write_back_len) {
+ if (unlikely(*nb_descs == 0 || desc->next >= vq_size)) {
+ VC_LOG_ERR("Invalid descriptors");
+ goto error_exit;
+ }
+ (*nb_descs)--;
+
+ desc = &vc_req->head[desc->next];
+ if (unlikely(!(desc->flags & VRING_DESC_F_WRITE))) {
+ VC_LOG_ERR("incorrect descriptor");
+ goto error_exit;
+ }
+
+ if (desc->len <= offset) {
+ offset -= desc->len;
+ continue;
+ }
+
+ dlen = desc->len;
+ dst = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen,
+ VHOST_ACCESS_RW) + offset;
+ if (unlikely(dst == NULL || dlen != desc->len)) {
+ VC_LOG_ERR("Failed to map descriptor");
+ goto error_exit;
+ }
+
+ wb_data->src = src;
+ wb_data->dst = dst;
+ wb_data->len = RTE_MIN(desc->len - offset, write_back_len);
+ write_back_len -= wb_data->len;
+ src += wb_data->len;
+ offset = 0;
+
+ if (write_back_len) {
+ ret = rte_mempool_get(vc_req->wb_pool,
+ (void **)&(wb_data->next));
+ if (unlikely(ret < 0)) {
+ VC_LOG_ERR("no memory");
+ goto error_exit;
+ }
+
+ wb_data = wb_data->next;
+ } else
+ wb_data->next = NULL;
+ }
+
+ if (unlikely(*nb_descs == 0))
+ *cur_desc = NULL;
+ else {
+ if (unlikely(desc->next >= vq_size))
+ goto error_exit;
+ *cur_desc = &vc_req->head[desc->next];
+ }
+
+ *end_wb_data = wb_data;
+
+ return head;
+
+error_exit:
+ if (head)
+ free_wb_data(head, vc_req->wb_pool);
+
+ return NULL;
+}
+
+static uint8_t
+prepare_sym_cipher_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
+ struct vhost_crypto_data_req *vc_req,
+ struct virtio_crypto_cipher_data_req *cipher,
+ struct vring_desc *cur_desc,
+ uint32_t *nb_descs, uint32_t vq_size)
+{
+ struct vring_desc *desc = cur_desc;
+ struct vhost_crypto_writeback_data *ewb = NULL;
+ struct rte_mbuf *m_src = op->sym->m_src, *m_dst = op->sym->m_dst;
+ uint8_t *iv_data = rte_crypto_op_ctod_offset(op, uint8_t *, IV_OFFSET);
+ uint8_t ret = 0;
+
+ /* prepare */
+ /* iv */
+ if (unlikely(copy_data(iv_data, vc_req, &desc, cipher->para.iv_len,
+ nb_descs, vq_size) < 0)) {
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ m_src->data_len = cipher->para.src_data_len;
+
+ switch (vcrypto->option) {
+ case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
+ m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
+ cipher->para.src_data_len);
+ m_src->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RO);
+ if (unlikely(m_src->buf_iova == 0 ||
+ m_src->buf_addr == NULL)) {
+ VC_LOG_ERR("zero_copy may fail due to cross page data");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ if (unlikely(move_desc(vc_req->head, &desc,
+ cipher->para.src_data_len, nb_descs,
+ vq_size) < 0)) {
+ VC_LOG_ERR("Incorrect descriptor");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ break;
+ case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE:
+ vc_req->wb_pool = vcrypto->wb_pool;
+
+ if (unlikely(cipher->para.src_data_len >
+ RTE_MBUF_DEFAULT_BUF_SIZE)) {
+ VC_LOG_ERR("Not enough space to do data copy");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+ if (unlikely(copy_data(rte_pktmbuf_mtod(m_src, uint8_t *),
+ vc_req, &desc, cipher->para.src_data_len,
+ nb_descs, vq_size) < 0)) {
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+ break;
+ default:
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ /* dst */
+ desc = find_write_desc(vc_req->head, desc, nb_descs, vq_size);
+ if (unlikely(!desc)) {
+ VC_LOG_ERR("Cannot find write location");
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ switch (vcrypto->option) {
+ case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
+ m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
+ desc->addr, cipher->para.dst_data_len);
+ m_dst->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RW);
+ if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
+ VC_LOG_ERR("zero_copy may fail due to cross page data");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ if (unlikely(move_desc(vc_req->head, &desc,
+ cipher->para.dst_data_len,
+ nb_descs, vq_size) < 0)) {
+ VC_LOG_ERR("Incorrect descriptor");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ m_dst->data_len = cipher->para.dst_data_len;
+ break;
+ case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE:
+ vc_req->wb = prepare_write_back_data(vc_req, &desc, &ewb,
+ rte_pktmbuf_mtod(m_src, uint8_t *), 0,
+ cipher->para.dst_data_len, nb_descs, vq_size);
+ if (unlikely(vc_req->wb == NULL)) {
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ break;
+ default:
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ /* src data */
+ op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
+ op->sess_type = RTE_CRYPTO_OP_WITH_SESSION;
+
+ op->sym->cipher.data.offset = 0;
+ op->sym->cipher.data.length = cipher->para.src_data_len;
+
+ vc_req->inhdr = get_data_ptr(vc_req, desc, VHOST_ACCESS_WO);
+ if (unlikely(vc_req->inhdr == NULL)) {
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ vc_req->inhdr->status = VIRTIO_CRYPTO_OK;
+ vc_req->len = cipher->para.dst_data_len + INHDR_LEN;
+
+ return 0;
+
+error_exit:
+ if (vc_req->wb)
+ free_wb_data(vc_req->wb, vc_req->wb_pool);
+
+ vc_req->len = INHDR_LEN;
+ return ret;
+}
+
+static uint8_t
+prepare_sym_chain_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
+ struct vhost_crypto_data_req *vc_req,
+ struct virtio_crypto_alg_chain_data_req *chain,
+ struct vring_desc *cur_desc,
+ uint32_t *nb_descs, uint32_t vq_size)
+{
+ struct vring_desc *desc = cur_desc, *digest_desc;
+ struct vhost_crypto_writeback_data *ewb = NULL, *ewb2 = NULL;
+ struct rte_mbuf *m_src = op->sym->m_src, *m_dst = op->sym->m_dst;
+ uint8_t *iv_data = rte_crypto_op_ctod_offset(op, uint8_t *, IV_OFFSET);
+ uint32_t digest_offset;
+ void *digest_addr;
+ uint8_t ret = 0;
+
+ /* prepare */
+ /* iv */
+ if (unlikely(copy_data(iv_data, vc_req, &desc,
+ chain->para.iv_len, nb_descs, vq_size) < 0)) {
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ m_src->data_len = chain->para.src_data_len;
+
+ switch (vcrypto->option) {
+ case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
+ m_dst->data_len = chain->para.dst_data_len;
+
+ m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
+ chain->para.src_data_len);
+ m_src->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RO);
+ if (unlikely(m_src->buf_iova == 0 || m_src->buf_addr == NULL)) {
+ VC_LOG_ERR("zero_copy may fail due to cross page data");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ if (unlikely(move_desc(vc_req->head, &desc,
+ chain->para.src_data_len,
+ nb_descs, vq_size) < 0)) {
+ VC_LOG_ERR("Incorrect descriptor");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+ break;
+ case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE:
+ vc_req->wb_pool = vcrypto->wb_pool;
+
+ if (unlikely(chain->para.src_data_len >
+ RTE_MBUF_DEFAULT_BUF_SIZE)) {
+ VC_LOG_ERR("Not enough space to do data copy");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+ if (unlikely(copy_data(rte_pktmbuf_mtod(m_src, uint8_t *),
+ vc_req, &desc, chain->para.src_data_len,
+ nb_descs, vq_size)) < 0) {
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ break;
+ default:
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ /* dst */
+ desc = find_write_desc(vc_req->head, desc, nb_descs, vq_size);
+ if (unlikely(!desc)) {
+ VC_LOG_ERR("Cannot find write location");
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ switch (vcrypto->option) {
+ case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
+ m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
+ desc->addr, chain->para.dst_data_len);
+ m_dst->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RW);
+ if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
+ VC_LOG_ERR("zero_copy may fail due to cross page data");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ if (unlikely(move_desc(vc_req->head, &desc,
+ chain->para.dst_data_len,
+ nb_descs, vq_size) < 0)) {
+ VC_LOG_ERR("Incorrect descriptor");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ op->sym->auth.digest.phys_addr = gpa_to_hpa(vcrypto->dev,
+ desc->addr, chain->para.hash_result_len);
+ op->sym->auth.digest.data = get_data_ptr(vc_req, desc,
+ VHOST_ACCESS_RW);
+ if (unlikely(op->sym->auth.digest.phys_addr == 0)) {
+ VC_LOG_ERR("zero_copy may fail due to cross page data");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ if (unlikely(move_desc(vc_req->head, &desc,
+ chain->para.hash_result_len,
+ nb_descs, vq_size) < 0)) {
+ VC_LOG_ERR("Incorrect descriptor");
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ break;
+ case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE:
+ vc_req->wb = prepare_write_back_data(vc_req, &desc, &ewb,
+ rte_pktmbuf_mtod(m_src, uint8_t *),
+ chain->para.cipher_start_src_offset,
+ chain->para.dst_data_len -
+ chain->para.cipher_start_src_offset,
+ nb_descs, vq_size);
+ if (unlikely(vc_req->wb == NULL)) {
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ digest_offset = m_src->data_len;
+ digest_addr = rte_pktmbuf_mtod_offset(m_src, void *,
+ digest_offset);
+ digest_desc = desc;
+
+ /** create a wb_data for digest */
+ ewb->next = prepare_write_back_data(vc_req, &desc, &ewb2,
+ digest_addr, 0, chain->para.hash_result_len,
+ nb_descs, vq_size);
+ if (unlikely(ewb->next == NULL)) {
+ ret = VIRTIO_CRYPTO_ERR;
+ goto error_exit;
+ }
+
+ if (unlikely(copy_data(digest_addr, vc_req, &digest_desc,
+ chain->para.hash_result_len,
+ nb_descs, vq_size) < 0)) {
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ op->sym->auth.digest.data = digest_addr;
+ op->sym->auth.digest.phys_addr = rte_pktmbuf_iova_offset(m_src,
+ digest_offset);
+ break;
+ default:
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ /* record inhdr */
+ vc_req->inhdr = get_data_ptr(vc_req, desc, VHOST_ACCESS_WO);
+ if (unlikely(vc_req->inhdr == NULL)) {
+ ret = VIRTIO_CRYPTO_BADMSG;
+ goto error_exit;
+ }
+
+ vc_req->inhdr->status = VIRTIO_CRYPTO_OK;
+
+ op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
+ op->sess_type = RTE_CRYPTO_OP_WITH_SESSION;
+
+ op->sym->cipher.data.offset = chain->para.cipher_start_src_offset;
+ op->sym->cipher.data.length = chain->para.src_data_len -
+ chain->para.cipher_start_src_offset;
+
+ op->sym->auth.data.offset = chain->para.hash_start_src_offset;
+ op->sym->auth.data.length = chain->para.len_to_hash;
+
+ vc_req->len = chain->para.dst_data_len + chain->para.hash_result_len +
+ INHDR_LEN;
+ return 0;
+
+error_exit:
+ if (vc_req->wb)
+ free_wb_data(vc_req->wb, vc_req->wb_pool);
+ vc_req->len = INHDR_LEN;
+ return ret;
+}
+
+/**
+ * Process on descriptor
+ */
+static __rte_always_inline int
+vhost_crypto_process_one_req(struct vhost_crypto *vcrypto,
+ struct vhost_virtqueue *vq, struct rte_crypto_op *op,
+ struct vring_desc *head, uint16_t desc_idx)
+{
+ struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(op->sym->m_src);
+ struct rte_cryptodev_sym_session *session;
+ struct virtio_crypto_op_data_req *req, tmp_req;
+ struct virtio_crypto_inhdr *inhdr;
+ struct vring_desc *desc = NULL;
+ uint64_t session_id;
+ uint64_t dlen;
+ uint32_t nb_descs = vq->size;
+ int err = 0;
+
+ vc_req->desc_idx = desc_idx;
+ vc_req->dev = vcrypto->dev;
+ vc_req->vq = vq;
+
+ if (likely(head->flags & VRING_DESC_F_INDIRECT)) {
+ dlen = head->len;
+ nb_descs = dlen / sizeof(struct vring_desc);
+ /* drop invalid descriptors */
+ if (unlikely(nb_descs > vq->size))
+ return -1;
+ desc = IOVA_TO_VVA(struct vring_desc *, vc_req, head->addr,
+ &dlen, VHOST_ACCESS_RO);
+ if (unlikely(!desc || dlen != head->len))
+ return -1;
+ desc_idx = 0;
+ head = desc;
+ } else {
+ desc = head;
+ }
+
+ vc_req->head = head;
+ vc_req->zero_copy = vcrypto->option;
+
+ req = get_data_ptr(vc_req, desc, VHOST_ACCESS_RO);
+ if (unlikely(req == NULL)) {
+ switch (vcrypto->option) {
+ case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
+ err = VIRTIO_CRYPTO_BADMSG;
+ VC_LOG_ERR("Invalid descriptor");
+ goto error_exit;
+ case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE:
+ req = &tmp_req;
+ if (unlikely(copy_data(req, vc_req, &desc, sizeof(*req),
+ &nb_descs, vq->size) < 0)) {
+ err = VIRTIO_CRYPTO_BADMSG;
+ VC_LOG_ERR("Invalid descriptor");
+ goto error_exit;
+ }
+ break;
+ default:
+ err = VIRTIO_CRYPTO_ERR;
+ VC_LOG_ERR("Invalid option");
+ goto error_exit;
+ }
+ } else {
+ if (unlikely(move_desc(vc_req->head, &desc,
+ sizeof(*req), &nb_descs, vq->size) < 0)) {
+ VC_LOG_ERR("Incorrect descriptor");
+ goto error_exit;
+ }
+ }
+
+ switch (req->header.opcode) {
+ case VIRTIO_CRYPTO_CIPHER_ENCRYPT:
+ case VIRTIO_CRYPTO_CIPHER_DECRYPT:
+ session_id = req->header.session_id;
+
+ /* one branch to avoid unnecessary table lookup */
+ if (vcrypto->cache_session_id != session_id) {
+ err = rte_hash_lookup_data(vcrypto->session_map,
+ &session_id, (void **)&session);
+ if (unlikely(err < 0)) {
+ err = VIRTIO_CRYPTO_ERR;
+ VC_LOG_ERR("Failed to find session %"PRIu64,
+ session_id);
+ goto error_exit;
+ }
+
+ vcrypto->cache_session = session;
+ vcrypto->cache_session_id = session_id;
+ }
+
+ session = vcrypto->cache_session;
+
+ err = rte_crypto_op_attach_sym_session(op, session);
+ if (unlikely(err < 0)) {
+ err = VIRTIO_CRYPTO_ERR;
+ VC_LOG_ERR("Failed to attach session to op");
+ goto error_exit;
+ }
+
+ switch (req->u.sym_req.op_type) {
+ case VIRTIO_CRYPTO_SYM_OP_NONE:
+ err = VIRTIO_CRYPTO_NOTSUPP;
+ break;
+ case VIRTIO_CRYPTO_SYM_OP_CIPHER:
+ err = prepare_sym_cipher_op(vcrypto, op, vc_req,
+ &req->u.sym_req.u.cipher, desc,
+ &nb_descs, vq->size);
+ break;
+ case VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING:
+ err = prepare_sym_chain_op(vcrypto, op, vc_req,
+ &req->u.sym_req.u.chain, desc,
+ &nb_descs, vq->size);
+ break;
+ }
+ if (unlikely(err != 0)) {
+ VC_LOG_ERR("Failed to process sym request");
+ goto error_exit;
+ }
+ break;
+ default:
+ VC_LOG_ERR("Unsupported symmetric crypto request type %u",
+ req->header.opcode);
+ goto error_exit;
+ }
+
+ return 0;
+
+error_exit:
+
+ inhdr = reach_inhdr(vc_req, desc, &nb_descs, vq->size);
+ if (likely(inhdr != NULL))
+ inhdr->status = (uint8_t)err;
+
+ return -1;
+}
+
+static __rte_always_inline struct vhost_virtqueue *
+vhost_crypto_finalize_one_request(struct rte_crypto_op *op,
+ struct vhost_virtqueue *old_vq)
+{
+ struct rte_mbuf *m_src = op->sym->m_src;
+ struct rte_mbuf *m_dst = op->sym->m_dst;
+ struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(m_src);
+ uint16_t desc_idx;
+
+ if (unlikely(!vc_req)) {
+ VC_LOG_ERR("Failed to retrieve vc_req");
+ return NULL;
+ }
+
+ if (old_vq && (vc_req->vq != old_vq))
+ return vc_req->vq;
+
+ desc_idx = vc_req->desc_idx;
+
+ if (unlikely(op->status != RTE_CRYPTO_OP_STATUS_SUCCESS))
+ vc_req->inhdr->status = VIRTIO_CRYPTO_ERR;
+ else {
+ if (vc_req->zero_copy == 0)
+ write_back_data(vc_req);
+ }
+
+ vc_req->vq->used->ring[desc_idx].id = desc_idx;
+ vc_req->vq->used->ring[desc_idx].len = vc_req->len;
+
+ rte_mempool_put(m_src->pool, (void *)m_src);
+
+ if (m_dst)
+ rte_mempool_put(m_dst->pool, (void *)m_dst);
+
+ return vc_req->vq;
+}
+
+static __rte_always_inline uint16_t
+vhost_crypto_complete_one_vm_requests(struct rte_crypto_op **ops,
+ uint16_t nb_ops, int *callfd)
+{
+ uint16_t processed = 1;
+ struct vhost_virtqueue *vq, *tmp_vq;
+
+ if (unlikely(nb_ops == 0))
+ return 0;
+
+ vq = vhost_crypto_finalize_one_request(ops[0], NULL);
+ if (unlikely(vq == NULL))
+ return 0;
+ tmp_vq = vq;
+
+ while ((processed < nb_ops)) {
+ tmp_vq = vhost_crypto_finalize_one_request(ops[processed],
+ tmp_vq);
+
+ if (unlikely(vq != tmp_vq))
+ break;
+
+ processed++;
+ }
+
+ *callfd = vq->callfd;
+
+ *(volatile uint16_t *)&vq->used->idx += processed;
+
+ return processed;
+}
+
+int __rte_experimental
+rte_vhost_crypto_create(int vid, uint8_t cryptodev_id,
+ struct rte_mempool *sess_pool,
+ struct rte_mempool *sess_priv_pool,
+ int socket_id)
+{
+ struct virtio_net *dev = get_device(vid);
+ struct rte_hash_parameters params = {0};
+ struct vhost_crypto *vcrypto;
+ char name[128];
+ int ret;
+
+ if (!dev) {
+ VC_LOG_ERR("Invalid vid %i", vid);
+ return -EINVAL;
+ }
+
+ ret = rte_vhost_driver_set_features(dev->ifname,
+ VIRTIO_CRYPTO_FEATURES);
+ if (ret < 0) {
+ VC_LOG_ERR("Error setting features");
+ return -1;
+ }
+
+ vcrypto = rte_zmalloc_socket(NULL, sizeof(*vcrypto),
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (!vcrypto) {
+ VC_LOG_ERR("Insufficient memory");
+ return -ENOMEM;
+ }
+
+ vcrypto->sess_pool = sess_pool;
+ vcrypto->sess_priv_pool = sess_priv_pool;
+ vcrypto->cid = cryptodev_id;
+ vcrypto->cache_session_id = UINT64_MAX;
+ vcrypto->last_session_id = 1;
+ vcrypto->dev = dev;
+ vcrypto->option = RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE;
+
+ snprintf(name, 127, "HASH_VHOST_CRYPT_%u", (uint32_t)vid);
+ params.name = name;
+ params.entries = VHOST_CRYPTO_SESSION_MAP_ENTRIES;
+ params.hash_func = rte_jhash;
+ params.key_len = sizeof(uint64_t);
+ params.socket_id = socket_id;
+ vcrypto->session_map = rte_hash_create(&params);
+ if (!vcrypto->session_map) {
+ VC_LOG_ERR("Failed to creath session map");
+ ret = -ENOMEM;
+ goto error_exit;
+ }
+
+ snprintf(name, 127, "MBUF_POOL_VM_%u", (uint32_t)vid);
+ vcrypto->mbuf_pool = rte_pktmbuf_pool_create(name,
+ VHOST_CRYPTO_MBUF_POOL_SIZE, 512,
+ sizeof(struct vhost_crypto_data_req),
+ RTE_MBUF_DEFAULT_DATAROOM * 2 + RTE_PKTMBUF_HEADROOM,
+ rte_socket_id());
+ if (!vcrypto->mbuf_pool) {
+ VC_LOG_ERR("Failed to creath mbuf pool");
+ ret = -ENOMEM;
+ goto error_exit;
+ }
+
+ snprintf(name, 127, "WB_POOL_VM_%u", (uint32_t)vid);
+ vcrypto->wb_pool = rte_mempool_create(name,
+ VHOST_CRYPTO_MBUF_POOL_SIZE,
+ sizeof(struct vhost_crypto_writeback_data),
+ 128, 0, NULL, NULL, NULL, NULL,
+ rte_socket_id(), 0);
+ if (!vcrypto->wb_pool) {
+ VC_LOG_ERR("Failed to creath mempool");
+ ret = -ENOMEM;
+ goto error_exit;
+ }
+
+ dev->extern_data = vcrypto;
+ dev->extern_ops.pre_msg_handle = NULL;
+ dev->extern_ops.post_msg_handle = vhost_crypto_msg_post_handler;
+
+ return 0;
+
+error_exit:
+ if (vcrypto->session_map)
+ rte_hash_free(vcrypto->session_map);
+ if (vcrypto->mbuf_pool)
+ rte_mempool_free(vcrypto->mbuf_pool);
+
+ rte_free(vcrypto);
+
+ return ret;
+}
+
+int __rte_experimental
+rte_vhost_crypto_free(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+ struct vhost_crypto *vcrypto;
+
+ if (unlikely(dev == NULL)) {
+ VC_LOG_ERR("Invalid vid %i", vid);
+ return -EINVAL;
+ }
+
+ vcrypto = dev->extern_data;
+ if (unlikely(vcrypto == NULL)) {
+ VC_LOG_ERR("Cannot find required data, is it initialized?");
+ return -ENOENT;
+ }
+
+ rte_hash_free(vcrypto->session_map);
+ rte_mempool_free(vcrypto->mbuf_pool);
+ rte_mempool_free(vcrypto->wb_pool);
+ rte_free(vcrypto);
+
+ dev->extern_data = NULL;
+ dev->extern_ops.pre_msg_handle = NULL;
+ dev->extern_ops.post_msg_handle = NULL;
+
+ return 0;
+}
+
+int __rte_experimental
+rte_vhost_crypto_set_zero_copy(int vid, enum rte_vhost_crypto_zero_copy option)
+{
+ struct virtio_net *dev = get_device(vid);
+ struct vhost_crypto *vcrypto;
+
+ if (unlikely(dev == NULL)) {
+ VC_LOG_ERR("Invalid vid %i", vid);
+ return -EINVAL;
+ }
+
+ if (unlikely((uint32_t)option >=
+ RTE_VHOST_CRYPTO_MAX_ZERO_COPY_OPTIONS)) {
+ VC_LOG_ERR("Invalid option %i", option);
+ return -EINVAL;
+ }
+
+ vcrypto = (struct vhost_crypto *)dev->extern_data;
+ if (unlikely(vcrypto == NULL)) {
+ VC_LOG_ERR("Cannot find required data, is it initialized?");
+ return -ENOENT;
+ }
+
+ if (vcrypto->option == (uint8_t)option)
+ return 0;
+
+ if (!(rte_mempool_full(vcrypto->mbuf_pool)) ||
+ !(rte_mempool_full(vcrypto->wb_pool))) {
+ VC_LOG_ERR("Cannot update zero copy as mempool is not full");
+ return -EINVAL;
+ }
+
+ if (option == RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE) {
+ char name[128];
+
+ snprintf(name, 127, "WB_POOL_VM_%u", (uint32_t)vid);
+ vcrypto->wb_pool = rte_mempool_create(name,
+ VHOST_CRYPTO_MBUF_POOL_SIZE,
+ sizeof(struct vhost_crypto_writeback_data),
+ 128, 0, NULL, NULL, NULL, NULL,
+ rte_socket_id(), 0);
+ if (!vcrypto->wb_pool) {
+ VC_LOG_ERR("Failed to creath mbuf pool");
+ return -ENOMEM;
+ }
+ } else {
+ rte_mempool_free(vcrypto->wb_pool);
+ vcrypto->wb_pool = NULL;
+ }
+
+ vcrypto->option = (uint8_t)option;
+
+ return 0;
+}
+
+uint16_t __rte_experimental
+rte_vhost_crypto_fetch_requests(int vid, uint32_t qid,
+ struct rte_crypto_op **ops, uint16_t nb_ops)
+{
+ struct rte_mbuf *mbufs[VHOST_CRYPTO_MAX_BURST_SIZE * 2];
+ struct virtio_net *dev = get_device(vid);
+ struct vhost_crypto *vcrypto;
+ struct vhost_virtqueue *vq;
+ uint16_t avail_idx;
+ uint16_t start_idx;
+ uint16_t count;
+ uint16_t i = 0;
+
+ if (unlikely(dev == NULL)) {
+ VC_LOG_ERR("Invalid vid %i", vid);
+ return -EINVAL;
+ }
+
+ if (unlikely(qid >= VHOST_MAX_QUEUE_PAIRS)) {
+ VC_LOG_ERR("Invalid qid %u", qid);
+ return -EINVAL;
+ }
+
+ vcrypto = (struct vhost_crypto *)dev->extern_data;
+ if (unlikely(vcrypto == NULL)) {
+ VC_LOG_ERR("Cannot find required data, is it initialized?");
+ return -ENOENT;
+ }
+
+ vq = dev->virtqueue[qid];
+
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ start_idx = vq->last_used_idx;
+ count = avail_idx - start_idx;
+ count = RTE_MIN(count, VHOST_CRYPTO_MAX_BURST_SIZE);
+ count = RTE_MIN(count, nb_ops);
+
+ if (unlikely(count == 0))
+ return 0;
+
+ /* for zero copy, we need 2 empty mbufs for src and dst, otherwise
+ * we need only 1 mbuf as src and dst
+ */
+ switch (vcrypto->option) {
+ case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
+ if (unlikely(rte_mempool_get_bulk(vcrypto->mbuf_pool,
+ (void **)mbufs, count * 2) < 0)) {
+ VC_LOG_ERR("Insufficient memory");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < count; i++) {
+ uint16_t used_idx = (start_idx + i) & (vq->size - 1);
+ uint16_t desc_idx = vq->avail->ring[used_idx];
+ struct vring_desc *head = &vq->desc[desc_idx];
+ struct rte_crypto_op *op = ops[i];
+
+ op->sym->m_src = mbufs[i * 2];
+ op->sym->m_dst = mbufs[i * 2 + 1];
+ op->sym->m_src->data_off = 0;
+ op->sym->m_dst->data_off = 0;
+
+ if (unlikely(vhost_crypto_process_one_req(vcrypto, vq,
+ op, head, desc_idx)) < 0)
+ break;
+ }
+
+ if (unlikely(i < count))
+ rte_mempool_put_bulk(vcrypto->mbuf_pool,
+ (void **)&mbufs[i * 2],
+ (count - i) * 2);
+
+ break;
+
+ case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE:
+ if (unlikely(rte_mempool_get_bulk(vcrypto->mbuf_pool,
+ (void **)mbufs, count) < 0)) {
+ VC_LOG_ERR("Insufficient memory");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < count; i++) {
+ uint16_t used_idx = (start_idx + i) & (vq->size - 1);
+ uint16_t desc_idx = vq->avail->ring[used_idx];
+ struct vring_desc *head = &vq->desc[desc_idx];
+ struct rte_crypto_op *op = ops[i];
+
+ op->sym->m_src = mbufs[i];
+ op->sym->m_dst = NULL;
+ op->sym->m_src->data_off = 0;
+
+ if (unlikely(vhost_crypto_process_one_req(vcrypto, vq,
+ op, head, desc_idx) < 0))
+ break;
+ }
+
+ if (unlikely(i < count))
+ rte_mempool_put_bulk(vcrypto->mbuf_pool,
+ (void **)&mbufs[i],
+ count - i);
+
+ break;
+
+ }
+
+ vq->last_used_idx += i;
+
+ return i;
+}
+
+uint16_t __rte_experimental
+rte_vhost_crypto_finalize_requests(struct rte_crypto_op **ops,
+ uint16_t nb_ops, int *callfds, uint16_t *nb_callfds)
+{
+ struct rte_crypto_op **tmp_ops = ops;
+ uint16_t count = 0, left = nb_ops;
+ int callfd;
+ uint16_t idx = 0;
+
+ while (left) {
+ count = vhost_crypto_complete_one_vm_requests(tmp_ops, left,
+ &callfd);
+ if (unlikely(count == 0))
+ break;
+
+ tmp_ops = &tmp_ops[count];
+ left -= count;
+
+ callfds[idx++] = callfd;
+
+ if (unlikely(idx >= VIRTIO_CRYPTO_MAX_NUM_BURST_VQS)) {
+ VC_LOG_ERR("Too many vqs");
+ break;
+ }
+ }
+
+ *nb_callfds = idx;
+
+ return nb_ops - left;
+}
diff --git a/src/seastar/dpdk/lib/librte_vhost/vhost_user.c b/src/seastar/dpdk/lib/librte_vhost/vhost_user.c
new file mode 100644
index 000000000..c9e29ece8
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/vhost_user.c
@@ -0,0 +1,2269 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+/* Security model
+ * --------------
+ * The vhost-user protocol connection is an external interface, so it must be
+ * robust against invalid inputs.
+ *
+ * This is important because the vhost-user master is only one step removed
+ * from the guest. Malicious guests that have escaped will then launch further
+ * attacks from the vhost-user master.
+ *
+ * Even in deployments where guests are trusted, a bug in the vhost-user master
+ * can still cause invalid messages to be sent. Such messages must not
+ * compromise the stability of the DPDK application by causing crashes, memory
+ * corruption, or other problematic behavior.
+ *
+ * Do not assume received VhostUserMsg fields contain sensible values!
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <assert.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+#include <linux/userfaultfd.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_log.h>
+
+#include "iotlb.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+#define VIRTIO_MIN_MTU 68
+#define VIRTIO_MAX_MTU 65535
+
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+ [VHOST_USER_NONE] = "VHOST_USER_NONE",
+ [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+ [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+ [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+ [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+ [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+ [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+ [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+ [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+ [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+ [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+ [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+ [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+ [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+ [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
+ [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
+ [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
+ [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
+ [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU",
+ [VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD",
+ [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG",
+ [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS",
+ [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS",
+ [VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE",
+ [VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN",
+ [VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END",
+};
+
+static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg);
+static int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
+
+static uint64_t
+get_blk_size(int fd)
+{
+ struct stat stat;
+ int ret;
+
+ ret = fstat(fd, &stat);
+ return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+/*
+ * Reclaim all the outstanding zmbufs for a virtqueue.
+ */
+static void
+drain_zmbuf_list(struct vhost_virtqueue *vq)
+{
+ struct zcopy_mbuf *zmbuf, *next;
+
+ for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+ zmbuf != NULL; zmbuf = next) {
+ next = TAILQ_NEXT(zmbuf, next);
+
+ while (!mbuf_is_consumed(zmbuf->mbuf))
+ usleep(1000);
+
+ TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ restore_mbuf(zmbuf->mbuf);
+ rte_pktmbuf_free(zmbuf->mbuf);
+ put_zmbuf(zmbuf);
+ vq->nr_zmbuf -= 1;
+ }
+}
+
+static void
+free_mem_region(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct rte_vhost_mem_region *reg;
+ struct vhost_virtqueue *vq;
+
+ if (!dev || !dev->mem)
+ return;
+
+ if (dev->dequeue_zero_copy) {
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+ if (vq)
+ drain_zmbuf_list(vq);
+ }
+ }
+
+ for (i = 0; i < dev->mem->nregions; i++) {
+ reg = &dev->mem->regions[i];
+ if (reg->host_user_addr) {
+ munmap(reg->mmap_addr, reg->mmap_size);
+ close(reg->fd);
+ }
+ }
+}
+
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+ if (dev->mem) {
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ }
+
+ free(dev->guest_pages);
+ dev->guest_pages = NULL;
+
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ dev->log_addr = 0;
+ }
+
+ if (dev->slave_req_fd >= 0) {
+ close(dev->slave_req_fd);
+ dev->slave_req_fd = -1;
+ }
+
+ if (dev->postcopy_ufd >= 0) {
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ }
+
+ dev->postcopy_listening = 0;
+}
+
+/*
+ * This function just returns success at the moment unless
+ * the device hasn't been initialised.
+ */
+static int
+vhost_user_set_owner(struct virtio_net **pdev __rte_unused,
+ struct VhostUserMsg *msg __rte_unused,
+ int main_fd __rte_unused)
+{
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_reset_owner(struct virtio_net **pdev,
+ struct VhostUserMsg *msg __rte_unused,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ vhost_destroy_device_notify(dev);
+
+ cleanup_device(dev, 0);
+ reset_device(dev);
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+/*
+ * The features that we support are requested.
+ */
+static int
+vhost_user_get_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ uint64_t features = 0;
+
+ rte_vhost_driver_get_features(dev->ifname, &features);
+
+ msg->payload.u64 = features;
+ msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+/*
+ * The queue number that we support are requested.
+ */
+static int
+vhost_user_get_queue_num(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ uint32_t queue_num = 0;
+
+ rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);
+
+ msg->payload.u64 = (uint64_t)queue_num;
+ msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+/*
+ * We receive the negotiated features supported by us and the virtio device.
+ */
+static int
+vhost_user_set_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ uint64_t features = msg->payload.u64;
+ uint64_t vhost_features = 0;
+ struct rte_vdpa_device *vdpa_dev;
+ int did = -1;
+
+ rte_vhost_driver_get_features(dev->ifname, &vhost_features);
+ if (features & ~vhost_features) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) received invalid negotiated features.\n",
+ dev->vid);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ if (dev->features == features)
+ return RTE_VHOST_MSG_RESULT_OK;
+
+ /*
+ * Error out if master tries to change features while device is
+ * in running state. The exception being VHOST_F_LOG_ALL, which
+ * is enabled when the live-migration starts.
+ */
+ if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) features changed while device is running.\n",
+ dev->vid);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (dev->notify_ops->features_changed)
+ dev->notify_ops->features_changed(dev->vid, features);
+ }
+
+ dev->features = features;
+ if (dev->features &
+ ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ } else {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr);
+ }
+ VHOST_LOG_DEBUG(VHOST_CONFIG,
+ "(%d) mergeable RX buffers %s, virtio 1 %s\n",
+ dev->vid,
+ (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
+ (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
+
+ if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) &&
+ !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
+ /*
+ * Remove all but first queue pair if MQ hasn't been
+ * negotiated. This is safe because the device is not
+ * running at this stage.
+ */
+ while (dev->nr_vring > 2) {
+ struct vhost_virtqueue *vq;
+
+ vq = dev->virtqueue[--dev->nr_vring];
+ if (!vq)
+ continue;
+
+ dev->virtqueue[dev->nr_vring] = NULL;
+ cleanup_vq(vq, 1);
+ free_vq(dev, vq);
+ }
+ }
+
+ did = dev->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (vdpa_dev && vdpa_dev->ops->set_features)
+ vdpa_dev->ops->set_features(dev->vid);
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+/*
+ * The virtio device sends us the size of the descriptor ring.
+ */
+static int
+vhost_user_set_vring_num(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+
+ vq->size = msg->payload.state.num;
+
+ /* VIRTIO 1.0, 2.4 Virtqueues says:
+ *
+ * Queue Size value is always a power of 2. The maximum Queue Size
+ * value is 32768.
+ */
+ if ((vq->size & (vq->size - 1)) || vq->size > 32768) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid virtqueue size %u\n", vq->size);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (dev->dequeue_zero_copy) {
+ vq->nr_zmbuf = 0;
+ vq->last_zmbuf_idx = 0;
+ vq->zmbuf_size = vq->size;
+ vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
+ sizeof(struct zcopy_mbuf), 0);
+ if (vq->zmbufs == NULL) {
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "failed to allocate mem for zero copy; "
+ "zero copy is force disabled\n");
+ dev->dequeue_zero_copy = 0;
+ }
+ TAILQ_INIT(&vq->zmbuf_list);
+ }
+
+ if (vq_is_packed(dev)) {
+ vq->shadow_used_packed = rte_malloc(NULL,
+ vq->size *
+ sizeof(struct vring_used_elem_packed),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_packed) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for shadow used ring.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ } else {
+ vq->shadow_used_split = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_split) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for shadow used ring.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ }
+
+ vq->batch_copy_elems = rte_malloc(NULL,
+ vq->size * sizeof(struct batch_copy_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->batch_copy_elems) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for batching copy.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+/*
+ * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the
+ * same numa node as the memory of vring descriptor.
+ */
+#ifdef RTE_LIBRTE_VHOST_NUMA
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index)
+{
+ int oldnode, newnode;
+ struct virtio_net *old_dev;
+ struct vhost_virtqueue *old_vq, *vq;
+ struct zcopy_mbuf *new_zmbuf;
+ struct vring_used_elem *new_shadow_used_split;
+ struct vring_used_elem_packed *new_shadow_used_packed;
+ struct batch_copy_elem *new_batch_copy_elems;
+ int ret;
+
+ old_dev = dev;
+ vq = old_vq = dev->virtqueue[index];
+
+ ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
+ MPOL_F_NODE | MPOL_F_ADDR);
+
+ /* check if we need to reallocate vq */
+ ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get vq numa information.\n");
+ return dev;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate vq from %d to %d node\n", oldnode, newnode);
+ vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
+ if (!vq)
+ return dev;
+
+ memcpy(vq, old_vq, sizeof(*vq));
+ TAILQ_INIT(&vq->zmbuf_list);
+
+ if (dev->dequeue_zero_copy) {
+ new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size *
+ sizeof(struct zcopy_mbuf), 0, newnode);
+ if (new_zmbuf) {
+ rte_free(vq->zmbufs);
+ vq->zmbufs = new_zmbuf;
+ }
+ }
+
+ if (vq_is_packed(dev)) {
+ new_shadow_used_packed = rte_malloc_socket(NULL,
+ vq->size *
+ sizeof(struct vring_used_elem_packed),
+ RTE_CACHE_LINE_SIZE,
+ newnode);
+ if (new_shadow_used_packed) {
+ rte_free(vq->shadow_used_packed);
+ vq->shadow_used_packed = new_shadow_used_packed;
+ }
+ } else {
+ new_shadow_used_split = rte_malloc_socket(NULL,
+ vq->size *
+ sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE,
+ newnode);
+ if (new_shadow_used_split) {
+ rte_free(vq->shadow_used_split);
+ vq->shadow_used_split = new_shadow_used_split;
+ }
+ }
+
+ new_batch_copy_elems = rte_malloc_socket(NULL,
+ vq->size * sizeof(struct batch_copy_elem),
+ RTE_CACHE_LINE_SIZE,
+ newnode);
+ if (new_batch_copy_elems) {
+ rte_free(vq->batch_copy_elems);
+ vq->batch_copy_elems = new_batch_copy_elems;
+ }
+
+ rte_free(old_vq);
+ }
+
+ /* check if we need to reallocate dev */
+ ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get dev numa information.\n");
+ goto out;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate dev from %d to %d node\n",
+ oldnode, newnode);
+ dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
+ if (!dev) {
+ dev = old_dev;
+ goto out;
+ }
+
+ memcpy(dev, old_dev, sizeof(*dev));
+ rte_free(old_dev);
+ }
+
+out:
+ dev->virtqueue[index] = vq;
+ vhost_devices[dev->vid] = dev;
+
+ if (old_vq != vq)
+ vhost_user_iotlb_init(dev, index);
+
+ return dev;
+}
+#else
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index __rte_unused)
+{
+ return dev;
+}
+#endif
+
+/* Converts QEMU virtual address to Vhost virtual address. */
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
+{
+ struct rte_vhost_mem_region *r;
+ uint32_t i;
+
+ if (unlikely(!dev || !dev->mem))
+ goto out_error;
+
+ /* Find the region where the address lives. */
+ for (i = 0; i < dev->mem->nregions; i++) {
+ r = &dev->mem->regions[i];
+
+ if (qva >= r->guest_user_addr &&
+ qva < r->guest_user_addr + r->size) {
+
+ if (unlikely(*len > r->guest_user_addr + r->size - qva))
+ *len = r->guest_user_addr + r->size - qva;
+
+ return qva - r->guest_user_addr +
+ r->host_user_addr;
+ }
+ }
+out_error:
+ *len = 0;
+
+ return 0;
+}
+
+
+/*
+ * Converts ring address to Vhost virtual address.
+ * If IOMMU is enabled, the ring address is a guest IO virtual address,
+ * else it is a QEMU virtual address.
+ */
+static uint64_t
+ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t ra, uint64_t *size)
+{
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
+ uint64_t vva;
+
+ vva = vhost_user_iotlb_cache_find(vq, ra,
+ size, VHOST_ACCESS_RW);
+ if (!vva)
+ vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW);
+
+ return vva;
+ }
+
+ return qva_to_vva(dev, ra, size);
+}
+
+static struct virtio_net *
+translate_ring_addresses(struct virtio_net *dev, int vq_index)
+{
+ struct vhost_virtqueue *vq = dev->virtqueue[vq_index];
+ struct vhost_vring_addr *addr = &vq->ring_addrs;
+ uint64_t len, expected_len;
+
+ if (vq_is_packed(dev)) {
+ len = sizeof(struct vring_packed_desc) * vq->size;
+ vq->desc_packed = (struct vring_packed_desc *)(uintptr_t)
+ ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len);
+ vq->log_guest_addr = 0;
+ if (vq->desc_packed == NULL ||
+ len != sizeof(struct vring_packed_desc) *
+ vq->size) {
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "(%d) failed to map desc_packed ring.\n",
+ dev->vid);
+ return dev;
+ }
+
+ dev = numa_realloc(dev, vq_index);
+ vq = dev->virtqueue[vq_index];
+ addr = &vq->ring_addrs;
+
+ len = sizeof(struct vring_packed_desc_event);
+ vq->driver_event = (struct vring_packed_desc_event *)
+ (uintptr_t)ring_addr_to_vva(dev,
+ vq, addr->avail_user_addr, &len);
+ if (vq->driver_event == NULL ||
+ len != sizeof(struct vring_packed_desc_event)) {
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "(%d) failed to find driver area address.\n",
+ dev->vid);
+ return dev;
+ }
+
+ len = sizeof(struct vring_packed_desc_event);
+ vq->device_event = (struct vring_packed_desc_event *)
+ (uintptr_t)ring_addr_to_vva(dev,
+ vq, addr->used_user_addr, &len);
+ if (vq->device_event == NULL ||
+ len != sizeof(struct vring_packed_desc_event)) {
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "(%d) failed to find device area address.\n",
+ dev->vid);
+ return dev;
+ }
+
+ return dev;
+ }
+
+ /* The addresses are converted from QEMU virtual to Vhost virtual. */
+ if (vq->desc && vq->avail && vq->used)
+ return dev;
+
+ len = sizeof(struct vring_desc) * vq->size;
+ vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
+ vq, addr->desc_user_addr, &len);
+ if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "(%d) failed to map desc ring.\n",
+ dev->vid);
+ return dev;
+ }
+
+ dev = numa_realloc(dev, vq_index);
+ vq = dev->virtqueue[vq_index];
+ addr = &vq->ring_addrs;
+
+ len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
+ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
+ len += sizeof(uint16_t);
+ expected_len = len;
+ vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
+ vq, addr->avail_user_addr, &len);
+ if (vq->avail == 0 || len != expected_len) {
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "(%d) failed to map avail ring.\n",
+ dev->vid);
+ return dev;
+ }
+
+ len = sizeof(struct vring_used) +
+ sizeof(struct vring_used_elem) * vq->size;
+ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
+ len += sizeof(uint16_t);
+ expected_len = len;
+ vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
+ vq, addr->used_user_addr, &len);
+ if (vq->used == 0 || len != expected_len) {
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "(%d) failed to map used ring.\n",
+ dev->vid);
+ return dev;
+ }
+
+ if (vq->last_used_idx != vq->used->idx) {
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
+ "some packets maybe resent for Tx and dropped for Rx\n",
+ vq->last_used_idx, vq->used->idx);
+ vq->last_used_idx = vq->used->idx;
+ vq->last_avail_idx = vq->used->idx;
+ }
+
+ vq->log_guest_addr = addr->log_guest_addr;
+
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
+ dev->vid, vq->desc);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
+ dev->vid, vq->avail);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
+ dev->vid, vq->used);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
+ dev->vid, vq->log_guest_addr);
+
+ return dev;
+}
+
+/*
+ * The virtio device sends us the desc, used and avail ring addresses.
+ * This function then converts these to our address space.
+ */
+static int
+vhost_user_set_vring_addr(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ struct vhost_virtqueue *vq;
+ struct vhost_vring_addr *addr = &msg->payload.addr;
+
+ if (dev->mem == NULL)
+ return RTE_VHOST_MSG_RESULT_ERR;
+
+ /* addr->index refers to the queue index. The txq 1, rxq is 0. */
+ vq = dev->virtqueue[msg->payload.addr.index];
+
+ /*
+ * Rings addresses should not be interpreted as long as the ring is not
+ * started and enabled
+ */
+ memcpy(&vq->ring_addrs, addr, sizeof(*addr));
+
+ vring_invalidate(dev, vq);
+
+ if (vq->enabled && (dev->features &
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
+ dev = translate_ring_addresses(dev, msg->payload.addr.index);
+ if (!dev)
+ return RTE_VHOST_MSG_RESULT_ERR;
+
+ *pdev = dev;
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+/*
+ * The virtio device sends us the available ring last used index.
+ */
+static int
+vhost_user_set_vring_base(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+ uint64_t val = msg->payload.state.num;
+
+ if (vq_is_packed(dev)) {
+ /*
+ * Bit[0:14]: avail index
+ * Bit[15]: avail wrap counter
+ */
+ vq->last_avail_idx = val & 0x7fff;
+ vq->avail_wrap_counter = !!(val & (0x1 << 15));
+ /*
+ * Set used index to same value as available one, as
+ * their values should be the same since ring processing
+ * was stopped at get time.
+ */
+ vq->last_used_idx = vq->last_avail_idx;
+ vq->used_wrap_counter = vq->avail_wrap_counter;
+ } else {
+ vq->last_used_idx = msg->payload.state.num;
+ vq->last_avail_idx = msg->payload.state.num;
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
+ uint64_t host_phys_addr, uint64_t size)
+{
+ struct guest_page *page, *last_page;
+ struct guest_page *old_pages;
+
+ if (dev->nr_guest_pages == dev->max_guest_pages) {
+ dev->max_guest_pages *= 2;
+ old_pages = dev->guest_pages;
+ dev->guest_pages = realloc(dev->guest_pages,
+ dev->max_guest_pages * sizeof(*page));
+ if (!dev->guest_pages) {
+ RTE_LOG(ERR, VHOST_CONFIG, "cannot realloc guest_pages\n");
+ free(old_pages);
+ return -1;
+ }
+ }
+
+ if (dev->nr_guest_pages > 0) {
+ last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
+ /* merge if the two pages are continuous */
+ if (host_phys_addr == last_page->host_phys_addr +
+ last_page->size) {
+ last_page->size += size;
+ return 0;
+ }
+ }
+
+ page = &dev->guest_pages[dev->nr_guest_pages++];
+ page->guest_phys_addr = guest_phys_addr;
+ page->host_phys_addr = host_phys_addr;
+ page->size = size;
+
+ return 0;
+}
+
+static int
+add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
+ uint64_t page_size)
+{
+ uint64_t reg_size = reg->size;
+ uint64_t host_user_addr = reg->host_user_addr;
+ uint64_t guest_phys_addr = reg->guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+
+ host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
+ size = page_size - (guest_phys_addr & (page_size - 1));
+ size = RTE_MIN(size, reg_size);
+
+ if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
+ return -1;
+
+ host_user_addr += size;
+ guest_phys_addr += size;
+ reg_size -= size;
+
+ while (reg_size > 0) {
+ size = RTE_MIN(reg_size, page_size);
+ host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
+ host_user_addr);
+ if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
+ size) < 0)
+ return -1;
+
+ host_user_addr += size;
+ guest_phys_addr += size;
+ reg_size -= size;
+ }
+
+ return 0;
+}
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+/* TODO: enable it only in debug mode? */
+static void
+dump_guest_pages(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct guest_page *page;
+
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "guest physical page region %u\n"
+ "\t guest_phys_addr: %" PRIx64 "\n"
+ "\t host_phys_addr : %" PRIx64 "\n"
+ "\t size : %" PRIx64 "\n",
+ i,
+ page->guest_phys_addr,
+ page->host_phys_addr,
+ page->size);
+ }
+}
+#else
+#define dump_guest_pages(dev)
+#endif
+
+static bool
+vhost_memory_changed(struct VhostUserMemory *new,
+ struct rte_vhost_memory *old)
+{
+ uint32_t i;
+
+ if (new->nregions != old->nregions)
+ return true;
+
+ for (i = 0; i < new->nregions; ++i) {
+ VhostUserMemoryRegion *new_r = &new->regions[i];
+ struct rte_vhost_mem_region *old_r = &old->regions[i];
+
+ if (new_r->guest_phys_addr != old_r->guest_phys_addr)
+ return true;
+ if (new_r->memory_size != old_r->size)
+ return true;
+ if (new_r->userspace_addr != old_r->guest_user_addr)
+ return true;
+ }
+
+ return false;
+}
+
+static int
+vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd)
+{
+ struct virtio_net *dev = *pdev;
+ struct VhostUserMemory *memory = &msg->payload.memory;
+ struct rte_vhost_mem_region *reg;
+ void *mmap_addr;
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint64_t alignment;
+ uint32_t i;
+ int populate;
+ int fd;
+
+ if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "too many memory regions (%u)\n", memory->nregions);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "(%d) memory regions not changed\n", dev->vid);
+
+ for (i = 0; i < memory->nregions; i++)
+ close(msg->fds[i]);
+
+ return RTE_VHOST_MSG_RESULT_OK;
+ }
+
+ if (dev->mem) {
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ }
+
+ /* Flush IOTLB cache as previous HVAs are now invalid */
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ for (i = 0; i < dev->nr_vring; i++)
+ vhost_user_iotlb_flush_all(dev->virtqueue[i]);
+
+ dev->nr_guest_pages = 0;
+ if (!dev->guest_pages) {
+ dev->max_guest_pages = 8;
+ dev->guest_pages = malloc(dev->max_guest_pages *
+ sizeof(struct guest_page));
+ if (dev->guest_pages == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to allocate memory "
+ "for dev->guest_pages\n",
+ dev->vid);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ }
+
+ dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
+ sizeof(struct rte_vhost_mem_region) * memory->nregions, 0);
+ if (dev->mem == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to allocate memory for dev->mem\n",
+ dev->vid);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ dev->mem->nregions = memory->nregions;
+
+ for (i = 0; i < memory->nregions; i++) {
+ fd = msg->fds[i];
+ reg = &dev->mem->regions[i];
+
+ reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
+ reg->guest_user_addr = memory->regions[i].userspace_addr;
+ reg->size = memory->regions[i].memory_size;
+ reg->fd = fd;
+
+ mmap_offset = memory->regions[i].mmap_offset;
+
+ /* Check for memory_size + mmap_offset overflow */
+ if (mmap_offset >= -reg->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap_offset (%#"PRIx64") and memory_size "
+ "(%#"PRIx64") overflow\n",
+ mmap_offset, reg->size);
+ goto err_mmap;
+ }
+
+ mmap_size = reg->size + mmap_offset;
+
+ /* mmap() without flag of MAP_ANONYMOUS, should be called
+ * with length argument aligned with hugepagesz at older
+ * longterm version Linux, like 2.6.32 and 3.2.72, or
+ * mmap() will fail with EINVAL.
+ *
+ * to avoid failure, make sure in caller to keep length
+ * aligned.
+ */
+ alignment = get_blk_size(fd);
+ if (alignment == (uint64_t)-1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "couldn't get hugepage size through fstat\n");
+ goto err_mmap;
+ }
+ mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+ populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
+ mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | populate, fd, 0);
+
+ if (mmap_addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap region %u failed.\n", i);
+ goto err_mmap;
+ }
+
+ reg->mmap_addr = mmap_addr;
+ reg->mmap_size = mmap_size;
+ reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
+ mmap_offset;
+
+ if (dev->dequeue_zero_copy)
+ if (add_guest_pages(dev, reg, alignment) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "adding guest pages to region %u failed.\n",
+ i);
+ goto err_mmap;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "guest memory region %u, size: 0x%" PRIx64 "\n"
+ "\t guest physical addr: 0x%" PRIx64 "\n"
+ "\t guest virtual addr: 0x%" PRIx64 "\n"
+ "\t host virtual addr: 0x%" PRIx64 "\n"
+ "\t mmap addr : 0x%" PRIx64 "\n"
+ "\t mmap size : 0x%" PRIx64 "\n"
+ "\t mmap align: 0x%" PRIx64 "\n"
+ "\t mmap off : 0x%" PRIx64 "\n",
+ i, reg->size,
+ reg->guest_phys_addr,
+ reg->guest_user_addr,
+ reg->host_user_addr,
+ (uint64_t)(uintptr_t)mmap_addr,
+ mmap_size,
+ alignment,
+ mmap_offset);
+
+ if (dev->postcopy_listening) {
+ /*
+ * We haven't a better way right now than sharing
+ * DPDK's virtual address with Qemu, so that Qemu can
+ * retrieve the region offset when handling userfaults.
+ */
+ memory->regions[i].userspace_addr =
+ reg->host_user_addr;
+ }
+ }
+ if (dev->postcopy_listening) {
+ /* Send the addresses back to qemu */
+ msg->fd_num = 0;
+ send_vhost_reply(main_fd, msg);
+
+ /* Wait for qemu to acknolwedge it's got the addresses
+ * we've got to wait before we're allowed to generate faults.
+ */
+ VhostUserMsg ack_msg;
+ if (read_vhost_message(main_fd, &ack_msg) <= 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to read qemu ack on postcopy set-mem-table\n");
+ goto err_mmap;
+ }
+ if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Bad qemu ack on postcopy set-mem-table (%d)\n",
+ ack_msg.request.master);
+ goto err_mmap;
+ }
+
+ /* Now userfault register and we can use the memory */
+ for (i = 0; i < memory->nregions; i++) {
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+ reg = &dev->mem->regions[i];
+ struct uffdio_register reg_struct;
+
+ /*
+ * Let's register all the mmap'ed area to ensure
+ * alignment on page boundary.
+ */
+ reg_struct.range.start =
+ (uint64_t)(uintptr_t)reg->mmap_addr;
+ reg_struct.range.len = reg->mmap_size;
+ reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+ if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
+ &reg_struct)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to register ufd for region %d: (ufd = %d) %s\n",
+ i, dev->postcopy_ufd,
+ strerror(errno));
+ goto err_mmap;
+ }
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "\t userfaultfd registered for range : %llx - %llx\n",
+ reg_struct.range.start,
+ reg_struct.range.start +
+ reg_struct.range.len - 1);
+#else
+ goto err_mmap;
+#endif
+ }
+ }
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+ if (vq->desc || vq->avail || vq->used) {
+ /*
+ * If the memory table got updated, the ring addresses
+ * need to be translated again as virtual addresses have
+ * changed.
+ */
+ vring_invalidate(dev, vq);
+
+ dev = translate_ring_addresses(dev, i);
+ if (!dev) {
+ dev = *pdev;
+ goto err_mmap;
+ }
+
+ *pdev = dev;
+ }
+ }
+
+ dump_guest_pages(dev);
+
+ return RTE_VHOST_MSG_RESULT_OK;
+
+err_mmap:
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ return RTE_VHOST_MSG_RESULT_ERR;
+}
+
+static bool
+vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ bool rings_ok;
+
+ if (!vq)
+ return false;
+
+ if (vq_is_packed(dev))
+ rings_ok = !!vq->desc_packed;
+ else
+ rings_ok = vq->desc && vq->avail && vq->used;
+
+ return rings_ok &&
+ vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+ vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
+}
+
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+ struct vhost_virtqueue *vq;
+ uint32_t i;
+
+ if (dev->nr_vring == 0)
+ return 0;
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+
+ if (!vq_is_ready(dev, vq))
+ return 0;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "virtio is now ready for processing.\n");
+ return 1;
+}
+
+static int
+vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ struct vhost_vring_file file;
+ struct vhost_virtqueue *vq;
+
+ file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = msg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring call idx:%d file:%d\n", file.index, file.fd);
+
+ vq = dev->virtqueue[file.index];
+ if (vq->callfd >= 0)
+ close(vq->callfd);
+
+ vq->callfd = file.fd;
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ if (!(msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+ close(msg->fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ struct vhost_vring_file file;
+ struct vhost_virtqueue *vq;
+
+ file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = msg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring kick idx:%d file:%d\n", file.index, file.fd);
+
+ /* Interpret ring addresses only when ring is started. */
+ dev = translate_ring_addresses(dev, file.index);
+ if (!dev)
+ return RTE_VHOST_MSG_RESULT_ERR;
+
+ *pdev = dev;
+
+ vq = dev->virtqueue[file.index];
+
+ /*
+ * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
+ * the ring starts already enabled. Otherwise, it is enabled via
+ * the SET_VRING_ENABLE message.
+ */
+ if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
+ vq->enabled = 1;
+ if (dev->notify_ops->vring_state_changed)
+ dev->notify_ops->vring_state_changed(
+ dev->vid, file.index, 1);
+ }
+
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+ vq->kickfd = file.fd;
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static void
+free_zmbufs(struct vhost_virtqueue *vq)
+{
+ drain_zmbuf_list(vq);
+
+ rte_free(vq->zmbufs);
+}
+
+/*
+ * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
+ */
+static int
+vhost_user_get_vring_base(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+ uint64_t val;
+
+ /* We have to stop the queue (virtio) if it is running. */
+ vhost_destroy_device_notify(dev);
+
+ dev->flags &= ~VIRTIO_DEV_READY;
+ dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
+
+ /* Here we are safe to get the indexes */
+ if (vq_is_packed(dev)) {
+ /*
+ * Bit[0:14]: avail index
+ * Bit[15]: avail wrap counter
+ */
+ val = vq->last_avail_idx & 0x7fff;
+ val |= vq->avail_wrap_counter << 15;
+ msg->payload.state.num = val;
+ } else {
+ msg->payload.state.num = vq->last_avail_idx;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring base idx:%d file:%d\n", msg->payload.state.index,
+ msg->payload.state.num);
+ /*
+ * Based on current qemu vhost-user implementation, this message is
+ * sent and only sent in vhost_vring_stop.
+ * TODO: cleanup the vring, it isn't usable since here.
+ */
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ if (vq->callfd >= 0)
+ close(vq->callfd);
+
+ vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ vq->signalled_used_valid = false;
+
+ if (dev->dequeue_zero_copy)
+ free_zmbufs(vq);
+ if (vq_is_packed(dev)) {
+ rte_free(vq->shadow_used_packed);
+ vq->shadow_used_packed = NULL;
+ } else {
+ rte_free(vq->shadow_used_split);
+ vq->shadow_used_split = NULL;
+ }
+
+ rte_free(vq->batch_copy_elems);
+ vq->batch_copy_elems = NULL;
+
+ msg->size = sizeof(msg->payload.state);
+ msg->fd_num = 0;
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+/*
+ * when virtio queues are ready to work, qemu will send us to
+ * enable the virtio queue pair.
+ */
+static int
+vhost_user_set_vring_enable(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ int enable = (int)msg->payload.state.num;
+ int index = (int)msg->payload.state.index;
+ struct rte_vdpa_device *vdpa_dev;
+ int did = -1;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set queue enable: %d to qp idx: %d\n",
+ enable, index);
+
+ did = dev->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (vdpa_dev && vdpa_dev->ops->set_vring_state)
+ vdpa_dev->ops->set_vring_state(dev->vid, index, enable);
+
+ if (dev->notify_ops->vring_state_changed)
+ dev->notify_ops->vring_state_changed(dev->vid,
+ index, enable);
+
+ /* On disable, rings have to be stopped being processed. */
+ if (!enable && dev->dequeue_zero_copy)
+ drain_zmbuf_list(dev->virtqueue[index]);
+
+ dev->virtqueue[index]->enabled = enable;
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_get_protocol_features(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ uint64_t features, protocol_features;
+
+ rte_vhost_driver_get_features(dev->ifname, &features);
+ rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features);
+
+ /*
+ * REPLY_ACK protocol feature is only mandatory for now
+ * for IOMMU feature. If IOMMU is explicitly disabled by the
+ * application, disable also REPLY_ACK feature for older buggy
+ * Qemu versions (from v2.7.0 to v2.9.0).
+ */
+ if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
+ protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK);
+
+ msg->payload.u64 = protocol_features;
+ msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+static int
+vhost_user_set_protocol_features(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ uint64_t protocol_features = msg->payload.u64;
+ uint64_t slave_protocol_features = 0;
+
+ rte_vhost_driver_get_protocol_features(dev->ifname,
+ &slave_protocol_features);
+ if (protocol_features & ~slave_protocol_features) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) received invalid protocol features.\n",
+ dev->vid);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ dev->protocol_features = protocol_features;
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ int fd = msg->fds[0];
+ uint64_t size, off;
+ void *addr;
+
+ if (fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (msg->size != sizeof(VhostUserLog)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid log base msg size: %"PRId32" != %d\n",
+ msg->size, (int)sizeof(VhostUserLog));
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ size = msg->payload.log.mmap_size;
+ off = msg->payload.log.mmap_offset;
+
+ /* Don't allow mmap_offset to point outside the mmap region */
+ if (off > size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n",
+ off, size);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "log mmap size: %"PRId64", offset: %"PRId64"\n",
+ size, off);
+
+ /*
+ * mmap from 0 to workaround a hugepage mmap bug: mmap will
+ * fail when offset is not page size aligned.
+ */
+ addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ /*
+ * Free previously mapped log memory on occasionally
+ * multiple VHOST_USER_SET_LOG_BASE.
+ */
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ }
+ dev->log_addr = (uint64_t)(uintptr_t)addr;
+ dev->log_base = dev->log_addr + off;
+ dev->log_size = size;
+
+ /*
+ * The spec is not clear about it (yet), but QEMU doesn't expect
+ * any payload in the reply.
+ */
+ msg->size = 0;
+ msg->fd_num = 0;
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ close(msg->fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+/*
+ * An rarp packet is constructed and broadcasted to notify switches about
+ * the new location of the migrated VM, so that packets from outside will
+ * not be lost after migration.
+ *
+ * However, we don't actually "send" a rarp packet here, instead, we set
+ * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
+ */
+static int
+vhost_user_send_rarp(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ uint8_t *mac = (uint8_t *)&msg->payload.u64;
+ struct rte_vdpa_device *vdpa_dev;
+ int did = -1;
+
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+ mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+ memcpy(dev->mac.addr_bytes, mac, 6);
+
+ /*
+ * Set the flag to inject a RARP broadcast packet at
+ * rte_vhost_dequeue_burst().
+ *
+ * rte_smp_wmb() is for making sure the mac is copied
+ * before the flag is set.
+ */
+ rte_smp_wmb();
+ rte_atomic16_set(&dev->broadcast_rarp, 1);
+ did = dev->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (vdpa_dev && vdpa_dev->ops->migration_done)
+ vdpa_dev->ops->migration_done(dev->vid);
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_net_set_mtu(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ if (msg->payload.u64 < VIRTIO_MIN_MTU ||
+ msg->payload.u64 > VIRTIO_MAX_MTU) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
+ msg->payload.u64);
+
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ dev->mtu = msg->payload.u64;
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_set_req_fd(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ int fd = msg->fds[0];
+
+ if (fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Invalid file descriptor for slave channel (%d)\n",
+ fd);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ dev->slave_req_fd = fd;
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+is_vring_iotlb_update(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
+{
+ struct vhost_vring_addr *ra;
+ uint64_t start, end;
+
+ start = imsg->iova;
+ end = start + imsg->size;
+
+ ra = &vq->ring_addrs;
+ if (ra->desc_user_addr >= start && ra->desc_user_addr < end)
+ return 1;
+ if (ra->avail_user_addr >= start && ra->avail_user_addr < end)
+ return 1;
+ if (ra->used_user_addr >= start && ra->used_user_addr < end)
+ return 1;
+
+ return 0;
+}
+
+static int
+is_vring_iotlb_invalidate(struct vhost_virtqueue *vq,
+ struct vhost_iotlb_msg *imsg)
+{
+ uint64_t istart, iend, vstart, vend;
+
+ istart = imsg->iova;
+ iend = istart + imsg->size - 1;
+
+ vstart = (uintptr_t)vq->desc;
+ vend = vstart + sizeof(struct vring_desc) * vq->size - 1;
+ if (vstart <= iend && istart <= vend)
+ return 1;
+
+ vstart = (uintptr_t)vq->avail;
+ vend = vstart + sizeof(struct vring_avail);
+ vend += sizeof(uint16_t) * vq->size - 1;
+ if (vstart <= iend && istart <= vend)
+ return 1;
+
+ vstart = (uintptr_t)vq->used;
+ vend = vstart + sizeof(struct vring_used);
+ vend += sizeof(struct vring_used_elem) * vq->size - 1;
+ if (vstart <= iend && istart <= vend)
+ return 1;
+
+ return 0;
+}
+
+static int
+vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+ struct vhost_iotlb_msg *imsg = &msg->payload.iotlb;
+ uint16_t i;
+ uint64_t vva, len;
+
+ switch (imsg->type) {
+ case VHOST_IOTLB_UPDATE:
+ len = imsg->size;
+ vva = qva_to_vva(dev, imsg->uaddr, &len);
+ if (!vva)
+ return RTE_VHOST_MSG_RESULT_ERR;
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+ vhost_user_iotlb_cache_insert(vq, imsg->iova, vva,
+ len, imsg->perm);
+
+ if (is_vring_iotlb_update(vq, imsg))
+ *pdev = dev = translate_ring_addresses(dev, i);
+ }
+ break;
+ case VHOST_IOTLB_INVALIDATE:
+ for (i = 0; i < dev->nr_vring; i++) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+ vhost_user_iotlb_cache_remove(vq, imsg->iova,
+ imsg->size);
+
+ if (is_vring_iotlb_invalidate(vq, imsg))
+ vring_invalidate(dev, vq);
+ }
+ break;
+ default:
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n",
+ imsg->type);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_set_postcopy_advise(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+ struct uffdio_api api_struct;
+
+ dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+
+ if (dev->postcopy_ufd == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Userfaultfd not available: %s\n",
+ strerror(errno));
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ api_struct.api = UFFD_API;
+ api_struct.features = 0;
+ if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "UFFDIO_API ioctl failure: %s\n",
+ strerror(errno));
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ msg->fds[0] = dev->postcopy_ufd;
+ msg->fd_num = 1;
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+#else
+ dev->postcopy_ufd = -1;
+ msg->fd_num = 0;
+
+ return RTE_VHOST_MSG_RESULT_ERR;
+#endif
+}
+
+static int
+vhost_user_set_postcopy_listen(struct virtio_net **pdev,
+ struct VhostUserMsg *msg __rte_unused,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+
+ if (dev->mem && dev->mem->nregions) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Regions already registered at postcopy-listen\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ dev->postcopy_listening = 1;
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
+vhost_user_postcopy_end(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+
+ dev->postcopy_listening = 0;
+ if (dev->postcopy_ufd >= 0) {
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ }
+
+ msg->payload.u64 = 0;
+ msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+typedef int (*vhost_message_handler_t)(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd);
+static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
+ [VHOST_USER_NONE] = NULL,
+ [VHOST_USER_GET_FEATURES] = vhost_user_get_features,
+ [VHOST_USER_SET_FEATURES] = vhost_user_set_features,
+ [VHOST_USER_SET_OWNER] = vhost_user_set_owner,
+ [VHOST_USER_RESET_OWNER] = vhost_user_reset_owner,
+ [VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table,
+ [VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base,
+ [VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd,
+ [VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num,
+ [VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr,
+ [VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base,
+ [VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base,
+ [VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick,
+ [VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call,
+ [VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err,
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features,
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features,
+ [VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num,
+ [VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable,
+ [VHOST_USER_SEND_RARP] = vhost_user_send_rarp,
+ [VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu,
+ [VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd,
+ [VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg,
+ [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
+ [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
+ [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
+};
+
+
+/* return bytes# of read on success or negative val on failure. */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+ msg->fds, VHOST_MEMORY_MAX_NREGIONS, &msg->fd_num);
+ if (ret <= 0)
+ return ret;
+
+ if (msg->size) {
+ if (msg->size > sizeof(msg->payload)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid msg size: %d\n", msg->size);
+ return -1;
+ }
+ ret = read(sockfd, &msg->payload, msg->size);
+ if (ret <= 0)
+ return ret;
+ if (ret != (int)msg->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "read control message failed\n");
+ return -1;
+ }
+ }
+
+ return ret;
+}
+
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ if (!msg)
+ return 0;
+
+ return send_fd_message(sockfd, (char *)msg,
+ VHOST_USER_HDR_SIZE + msg->size, msg->fds, msg->fd_num);
+}
+
+static int
+send_vhost_reply(int sockfd, struct VhostUserMsg *msg)
+{
+ if (!msg)
+ return 0;
+
+ msg->flags &= ~VHOST_USER_VERSION_MASK;
+ msg->flags &= ~VHOST_USER_NEED_REPLY;
+ msg->flags |= VHOST_USER_VERSION;
+ msg->flags |= VHOST_USER_REPLY_MASK;
+
+ return send_vhost_message(sockfd, msg);
+}
+
+static int
+send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ if (msg->flags & VHOST_USER_NEED_REPLY)
+ rte_spinlock_lock(&dev->slave_req_lock);
+
+ ret = send_vhost_message(dev->slave_req_fd, msg);
+ if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY))
+ rte_spinlock_unlock(&dev->slave_req_lock);
+
+ return ret;
+}
+
+/*
+ * Allocate a queue pair if it hasn't been allocated yet
+ */
+static int
+vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
+ struct VhostUserMsg *msg)
+{
+ uint16_t vring_idx;
+
+ switch (msg->request.master) {
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ break;
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ENABLE:
+ vring_idx = msg->payload.state.index;
+ break;
+ case VHOST_USER_SET_VRING_ADDR:
+ vring_idx = msg->payload.addr.index;
+ break;
+ default:
+ return 0;
+ }
+
+ if (vring_idx >= VHOST_MAX_VRING) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid vring index: %u\n", vring_idx);
+ return -1;
+ }
+
+ if (dev->virtqueue[vring_idx])
+ return 0;
+
+ return alloc_vring_queue(dev, vring_idx);
+}
+
+static void
+vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
+{
+ unsigned int i = 0;
+ unsigned int vq_num = 0;
+
+ while (vq_num < dev->nr_vring) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+ if (vq) {
+ rte_spinlock_lock(&vq->access_lock);
+ vq_num++;
+ }
+ i++;
+ }
+}
+
+static void
+vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
+{
+ unsigned int i = 0;
+ unsigned int vq_num = 0;
+
+ while (vq_num < dev->nr_vring) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+ if (vq) {
+ rte_spinlock_unlock(&vq->access_lock);
+ vq_num++;
+ }
+ i++;
+ }
+}
+
+int
+vhost_user_msg_handler(int vid, int fd)
+{
+ struct virtio_net *dev;
+ struct VhostUserMsg msg;
+ struct rte_vdpa_device *vdpa_dev;
+ int did = -1;
+ int ret;
+ int unlock_required = 0;
+ bool handled;
+ int request;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ if (!dev->notify_ops) {
+ dev->notify_ops = vhost_driver_callback_get(dev->ifname);
+ if (!dev->notify_ops) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to get callback ops for driver %s\n",
+ dev->ifname);
+ return -1;
+ }
+ }
+
+ ret = read_vhost_message(fd, &msg);
+ if (ret <= 0) {
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost read message failed\n");
+ else
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vhost peer closed\n");
+
+ return -1;
+ }
+
+ ret = 0;
+ request = msg.request.master;
+ if (request > VHOST_USER_NONE && request < VHOST_USER_MAX &&
+ vhost_message_str[request]) {
+ if (request != VHOST_USER_IOTLB_MSG)
+ RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+ vhost_message_str[request]);
+ else
+ RTE_LOG(DEBUG, VHOST_CONFIG, "read message %s\n",
+ vhost_message_str[request]);
+ } else {
+ RTE_LOG(DEBUG, VHOST_CONFIG, "External request %d\n", request);
+ }
+
+ ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to alloc queue\n");
+ return -1;
+ }
+
+ /*
+ * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
+ * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops
+ * and device is destroyed. destroy_device waits for queues to be
+ * inactive, so it is safe. Otherwise taking the access_lock
+ * would cause a dead lock.
+ */
+ switch (request) {
+ case VHOST_USER_SET_FEATURES:
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ case VHOST_USER_SET_OWNER:
+ case VHOST_USER_SET_MEM_TABLE:
+ case VHOST_USER_SET_LOG_BASE:
+ case VHOST_USER_SET_LOG_FD:
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_ADDR:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ case VHOST_USER_SET_VRING_ENABLE:
+ case VHOST_USER_SEND_RARP:
+ case VHOST_USER_NET_SET_MTU:
+ case VHOST_USER_SET_SLAVE_REQ_FD:
+ vhost_user_lock_all_queue_pairs(dev);
+ unlock_required = 1;
+ break;
+ default:
+ break;
+
+ }
+
+ handled = false;
+ if (dev->extern_ops.pre_msg_handle) {
+ ret = (*dev->extern_ops.pre_msg_handle)(dev->vid,
+ (void *)&msg);
+ switch (ret) {
+ case RTE_VHOST_MSG_RESULT_REPLY:
+ send_vhost_reply(fd, &msg);
+ /* Fall-through */
+ case RTE_VHOST_MSG_RESULT_ERR:
+ case RTE_VHOST_MSG_RESULT_OK:
+ handled = true;
+ goto skip_to_post_handle;
+ case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
+ default:
+ break;
+ }
+ }
+
+ if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) {
+ if (!vhost_message_handlers[request])
+ goto skip_to_post_handle;
+ ret = vhost_message_handlers[request](&dev, &msg, fd);
+
+ switch (ret) {
+ case RTE_VHOST_MSG_RESULT_ERR:
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Processing %s failed.\n",
+ vhost_message_str[request]);
+ handled = true;
+ break;
+ case RTE_VHOST_MSG_RESULT_OK:
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "Processing %s succeeded.\n",
+ vhost_message_str[request]);
+ handled = true;
+ break;
+ case RTE_VHOST_MSG_RESULT_REPLY:
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "Processing %s succeeded and needs reply.\n",
+ vhost_message_str[request]);
+ send_vhost_reply(fd, &msg);
+ handled = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+skip_to_post_handle:
+ if (ret != RTE_VHOST_MSG_RESULT_ERR &&
+ dev->extern_ops.post_msg_handle) {
+ ret = (*dev->extern_ops.post_msg_handle)(dev->vid,
+ (void *)&msg);
+ switch (ret) {
+ case RTE_VHOST_MSG_RESULT_REPLY:
+ send_vhost_reply(fd, &msg);
+ /* Fall-through */
+ case RTE_VHOST_MSG_RESULT_ERR:
+ case RTE_VHOST_MSG_RESULT_OK:
+ handled = true;
+ case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
+ default:
+ break;
+ }
+ }
+
+ if (unlock_required)
+ vhost_user_unlock_all_queue_pairs(dev);
+
+ /* If message was not handled at this stage, treat it as an error */
+ if (!handled) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost message (req: %d) was not handled.\n", request);
+ ret = RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ /*
+ * If the request required a reply that was already sent,
+ * this optional reply-ack won't be sent as the
+ * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply().
+ */
+ if (msg.flags & VHOST_USER_NEED_REPLY) {
+ msg.payload.u64 = ret == RTE_VHOST_MSG_RESULT_ERR;
+ msg.size = sizeof(msg.payload.u64);
+ msg.fd_num = 0;
+ send_vhost_reply(fd, &msg);
+ } else if (ret == RTE_VHOST_MSG_RESULT_ERR) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost message handling failed.\n");
+ return -1;
+ }
+
+ if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
+ dev->flags |= VIRTIO_DEV_READY;
+
+ if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+ if (dev->dequeue_zero_copy) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "dequeue zero copy is enabled\n");
+ }
+
+ if (dev->notify_ops->new_device(dev->vid) == 0)
+ dev->flags |= VIRTIO_DEV_RUNNING;
+ }
+ }
+
+ did = dev->vdpa_dev_id;
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (vdpa_dev && virtio_is_ready(dev) &&
+ !(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) &&
+ msg.request.master == VHOST_USER_SET_VRING_CALL) {
+ if (vdpa_dev->ops->dev_conf)
+ vdpa_dev->ops->dev_conf(dev->vid);
+ dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED;
+ }
+
+ return 0;
+}
+
+static int process_slave_message_reply(struct virtio_net *dev,
+ const struct VhostUserMsg *msg)
+{
+ struct VhostUserMsg msg_reply;
+ int ret;
+
+ if ((msg->flags & VHOST_USER_NEED_REPLY) == 0)
+ return 0;
+
+ if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0) {
+ ret = -1;
+ goto out;
+ }
+
+ if (msg_reply.request.slave != msg->request.slave) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Received unexpected msg type (%u), expected %u\n",
+ msg_reply.request.slave, msg->request.slave);
+ ret = -1;
+ goto out;
+ }
+
+ ret = msg_reply.payload.u64 ? -1 : 0;
+
+out:
+ rte_spinlock_unlock(&dev->slave_req_lock);
+ return ret;
+}
+
+int
+vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
+{
+ int ret;
+ struct VhostUserMsg msg = {
+ .request.slave = VHOST_USER_SLAVE_IOTLB_MSG,
+ .flags = VHOST_USER_VERSION,
+ .size = sizeof(msg.payload.iotlb),
+ .payload.iotlb = {
+ .iova = iova,
+ .perm = perm,
+ .type = VHOST_IOTLB_MISS,
+ },
+ };
+
+ ret = send_vhost_message(dev->slave_req_fd, &msg);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to send IOTLB miss message (%d)\n",
+ ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
+ int index, int fd,
+ uint64_t offset,
+ uint64_t size)
+{
+ int ret;
+ struct VhostUserMsg msg = {
+ .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
+ .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
+ .size = sizeof(msg.payload.area),
+ .payload.area = {
+ .u64 = index & VHOST_USER_VRING_IDX_MASK,
+ .size = size,
+ .offset = offset,
+ },
+ };
+
+ if (fd < 0)
+ msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
+ else {
+ msg.fds[0] = fd;
+ msg.fd_num = 1;
+ }
+
+ ret = send_vhost_slave_message(dev, &msg);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to set host notifier (%d)\n", ret);
+ return ret;
+ }
+
+ return process_slave_message_reply(dev, &msg);
+}
+
+int rte_vhost_host_notifier_ctrl(int vid, bool enable)
+{
+ struct virtio_net *dev;
+ struct rte_vdpa_device *vdpa_dev;
+ int vfio_device_fd, did, ret = 0;
+ uint64_t offset, size;
+ unsigned int i;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -ENODEV;
+
+ did = dev->vdpa_dev_id;
+ if (did < 0)
+ return -EINVAL;
+
+ if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ||
+ !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) ||
+ !(dev->protocol_features &
+ (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) ||
+ !(dev->protocol_features &
+ (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) ||
+ !(dev->protocol_features &
+ (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)))
+ return -ENOTSUP;
+
+ vdpa_dev = rte_vdpa_get_device(did);
+ if (!vdpa_dev)
+ return -ENODEV;
+
+ RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP);
+ RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP);
+
+ vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid);
+ if (vfio_device_fd < 0)
+ return -ENOTSUP;
+
+ if (enable) {
+ for (i = 0; i < dev->nr_vring; i++) {
+ if (vdpa_dev->ops->get_notify_area(vid, i, &offset,
+ &size) < 0) {
+ ret = -ENOTSUP;
+ goto disable;
+ }
+
+ if (vhost_user_slave_set_vring_host_notifier(dev, i,
+ vfio_device_fd, offset, size) < 0) {
+ ret = -EFAULT;
+ goto disable;
+ }
+ }
+ } else {
+disable:
+ for (i = 0; i < dev->nr_vring; i++) {
+ vhost_user_slave_set_vring_host_notifier(dev, i, -1,
+ 0, 0);
+ }
+ }
+
+ return ret;
+}
diff --git a/src/seastar/dpdk/lib/librte_vhost/vhost_user.h b/src/seastar/dpdk/lib/librte_vhost/vhost_user.h
new file mode 100644
index 000000000..2a650fe4b
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/vhost_user.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_vhost.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
+ (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
+
+typedef enum VhostUserRequest {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_SEND_RARP = 19,
+ VHOST_USER_NET_SET_MTU = 20,
+ VHOST_USER_SET_SLAVE_REQ_FD = 21,
+ VHOST_USER_IOTLB_MSG = 22,
+ VHOST_USER_CRYPTO_CREATE_SESS = 26,
+ VHOST_USER_CRYPTO_CLOSE_SESS = 27,
+ VHOST_USER_POSTCOPY_ADVISE = 28,
+ VHOST_USER_POSTCOPY_LISTEN = 29,
+ VHOST_USER_POSTCOPY_END = 30,
+ VHOST_USER_MAX = 31
+} VhostUserRequest;
+
+typedef enum VhostUserSlaveRequest {
+ VHOST_USER_SLAVE_NONE = 0,
+ VHOST_USER_SLAVE_IOTLB_MSG = 1,
+ VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+ VHOST_USER_SLAVE_MAX
+} VhostUserSlaveRequest;
+
+typedef struct VhostUserMemoryRegion {
+ uint64_t guest_phys_addr;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+ uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+ uint32_t nregions;
+ uint32_t padding;
+ VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserLog {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+} VhostUserLog;
+
+/* Comply with Cryptodev-Linux */
+#define VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH 512
+#define VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH 64
+
+/* Same structure as vhost-user backend session info */
+typedef struct VhostUserCryptoSessionParam {
+ int64_t session_id;
+ uint32_t op_code;
+ uint32_t cipher_algo;
+ uint32_t cipher_key_len;
+ uint32_t hash_algo;
+ uint32_t digest_len;
+ uint32_t auth_key_len;
+ uint32_t aad_len;
+ uint8_t op_type;
+ uint8_t dir;
+ uint8_t hash_mode;
+ uint8_t chaining_dir;
+ uint8_t *ciphe_key;
+ uint8_t *auth_key;
+ uint8_t cipher_key_buf[VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH];
+ uint8_t auth_key_buf[VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH];
+} VhostUserCryptoSessionParam;
+
+typedef struct VhostUserVringArea {
+ uint64_t u64;
+ uint64_t size;
+ uint64_t offset;
+} VhostUserVringArea;
+
+typedef struct VhostUserMsg {
+ union {
+ uint32_t master; /* a VhostUserRequest value */
+ uint32_t slave; /* a VhostUserSlaveRequest value*/
+ } request;
+
+#define VHOST_USER_VERSION_MASK 0x3
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+#define VHOST_USER_NEED_REPLY (0x1 << 3)
+ uint32_t flags;
+ uint32_t size; /* the following payload size */
+ union {
+#define VHOST_USER_VRING_IDX_MASK 0xff
+#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ VhostUserMemory memory;
+ VhostUserLog log;
+ struct vhost_iotlb_msg iotlb;
+ VhostUserCryptoSessionParam crypto_session;
+ VhostUserVringArea area;
+ } payload;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ int fd_num;
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION 0x1
+
+
+/* vhost_user.c */
+int vhost_user_msg_handler(int vid, int fd);
+int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
+
+/* socket.c */
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
+ int *fd_num);
+int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+
+#endif
diff --git a/src/seastar/dpdk/lib/librte_vhost/virtio_crypto.h b/src/seastar/dpdk/lib/librte_vhost/virtio_crypto.h
new file mode 100644
index 000000000..e3b93573c
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/virtio_crypto.h
@@ -0,0 +1,422 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_CRYPTO_H
+#define _VIRTIO_CRYPTO_H
+
+#define VIRTIO_CRYPTO_SERVICE_CIPHER 0
+#define VIRTIO_CRYPTO_SERVICE_HASH 1
+#define VIRTIO_CRYPTO_SERVICE_MAC 2
+#define VIRTIO_CRYPTO_SERVICE_AEAD 3
+
+#define VIRTIO_CRYPTO_OPCODE(service, op) (((service) << 8) | (op))
+
+struct virtio_crypto_ctrl_header {
+#define VIRTIO_CRYPTO_CIPHER_CREATE_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x02)
+#define VIRTIO_CRYPTO_CIPHER_DESTROY_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x03)
+#define VIRTIO_CRYPTO_HASH_CREATE_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x02)
+#define VIRTIO_CRYPTO_HASH_DESTROY_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x03)
+#define VIRTIO_CRYPTO_MAC_CREATE_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x02)
+#define VIRTIO_CRYPTO_MAC_DESTROY_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x03)
+#define VIRTIO_CRYPTO_AEAD_CREATE_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x02)
+#define VIRTIO_CRYPTO_AEAD_DESTROY_SESSION \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x03)
+ uint32_t opcode;
+ uint32_t algo;
+ uint32_t flag;
+ /* data virtqueue id */
+ uint32_t queue_id;
+};
+
+struct virtio_crypto_cipher_session_para {
+#define VIRTIO_CRYPTO_NO_CIPHER 0
+#define VIRTIO_CRYPTO_CIPHER_ARC4 1
+#define VIRTIO_CRYPTO_CIPHER_AES_ECB 2
+#define VIRTIO_CRYPTO_CIPHER_AES_CBC 3
+#define VIRTIO_CRYPTO_CIPHER_AES_CTR 4
+#define VIRTIO_CRYPTO_CIPHER_DES_ECB 5
+#define VIRTIO_CRYPTO_CIPHER_DES_CBC 6
+#define VIRTIO_CRYPTO_CIPHER_3DES_ECB 7
+#define VIRTIO_CRYPTO_CIPHER_3DES_CBC 8
+#define VIRTIO_CRYPTO_CIPHER_3DES_CTR 9
+#define VIRTIO_CRYPTO_CIPHER_KASUMI_F8 10
+#define VIRTIO_CRYPTO_CIPHER_SNOW3G_UEA2 11
+#define VIRTIO_CRYPTO_CIPHER_AES_F8 12
+#define VIRTIO_CRYPTO_CIPHER_AES_XTS 13
+#define VIRTIO_CRYPTO_CIPHER_ZUC_EEA3 14
+ uint32_t algo;
+ /* length of key */
+ uint32_t keylen;
+
+#define VIRTIO_CRYPTO_OP_ENCRYPT 1
+#define VIRTIO_CRYPTO_OP_DECRYPT 2
+ /* encrypt or decrypt */
+ uint32_t op;
+ uint32_t padding;
+};
+
+struct virtio_crypto_session_input {
+ /* Device-writable part */
+ uint64_t session_id;
+ uint32_t status;
+ uint32_t padding;
+};
+
+struct virtio_crypto_cipher_session_req {
+ struct virtio_crypto_cipher_session_para para;
+ uint8_t padding[32];
+};
+
+struct virtio_crypto_hash_session_para {
+#define VIRTIO_CRYPTO_NO_HASH 0
+#define VIRTIO_CRYPTO_HASH_MD5 1
+#define VIRTIO_CRYPTO_HASH_SHA1 2
+#define VIRTIO_CRYPTO_HASH_SHA_224 3
+#define VIRTIO_CRYPTO_HASH_SHA_256 4
+#define VIRTIO_CRYPTO_HASH_SHA_384 5
+#define VIRTIO_CRYPTO_HASH_SHA_512 6
+#define VIRTIO_CRYPTO_HASH_SHA3_224 7
+#define VIRTIO_CRYPTO_HASH_SHA3_256 8
+#define VIRTIO_CRYPTO_HASH_SHA3_384 9
+#define VIRTIO_CRYPTO_HASH_SHA3_512 10
+#define VIRTIO_CRYPTO_HASH_SHA3_SHAKE128 11
+#define VIRTIO_CRYPTO_HASH_SHA3_SHAKE256 12
+ uint32_t algo;
+ /* hash result length */
+ uint32_t hash_result_len;
+ uint8_t padding[8];
+};
+
+struct virtio_crypto_hash_create_session_req {
+ struct virtio_crypto_hash_session_para para;
+ uint8_t padding[40];
+};
+
+struct virtio_crypto_mac_session_para {
+#define VIRTIO_CRYPTO_NO_MAC 0
+#define VIRTIO_CRYPTO_MAC_HMAC_MD5 1
+#define VIRTIO_CRYPTO_MAC_HMAC_SHA1 2
+#define VIRTIO_CRYPTO_MAC_HMAC_SHA_224 3
+#define VIRTIO_CRYPTO_MAC_HMAC_SHA_256 4
+#define VIRTIO_CRYPTO_MAC_HMAC_SHA_384 5
+#define VIRTIO_CRYPTO_MAC_HMAC_SHA_512 6
+#define VIRTIO_CRYPTO_MAC_CMAC_3DES 25
+#define VIRTIO_CRYPTO_MAC_CMAC_AES 26
+#define VIRTIO_CRYPTO_MAC_KASUMI_F9 27
+#define VIRTIO_CRYPTO_MAC_SNOW3G_UIA2 28
+#define VIRTIO_CRYPTO_MAC_GMAC_AES 41
+#define VIRTIO_CRYPTO_MAC_GMAC_TWOFISH 42
+#define VIRTIO_CRYPTO_MAC_CBCMAC_AES 49
+#define VIRTIO_CRYPTO_MAC_CBCMAC_KASUMI_F9 50
+#define VIRTIO_CRYPTO_MAC_XCBC_AES 53
+ uint32_t algo;
+ /* hash result length */
+ uint32_t hash_result_len;
+ /* length of authenticated key */
+ uint32_t auth_key_len;
+ uint32_t padding;
+};
+
+struct virtio_crypto_mac_create_session_req {
+ struct virtio_crypto_mac_session_para para;
+ uint8_t padding[40];
+};
+
+struct virtio_crypto_aead_session_para {
+#define VIRTIO_CRYPTO_NO_AEAD 0
+#define VIRTIO_CRYPTO_AEAD_GCM 1
+#define VIRTIO_CRYPTO_AEAD_CCM 2
+#define VIRTIO_CRYPTO_AEAD_CHACHA20_POLY1305 3
+ uint32_t algo;
+ /* length of key */
+ uint32_t key_len;
+ /* hash result length */
+ uint32_t hash_result_len;
+ /* length of the additional authenticated data (AAD) in bytes */
+ uint32_t aad_len;
+ /* encrypt or decrypt, See above VIRTIO_CRYPTO_OP_* */
+ uint32_t op;
+ uint32_t padding;
+};
+
+struct virtio_crypto_aead_create_session_req {
+ struct virtio_crypto_aead_session_para para;
+ uint8_t padding[32];
+};
+
+struct virtio_crypto_alg_chain_session_para {
+#define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER 1
+#define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH 2
+ uint32_t alg_chain_order;
+/* Plain hash */
+#define VIRTIO_CRYPTO_SYM_HASH_MODE_PLAIN 1
+/* Authenticated hash (mac) */
+#define VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH 2
+/* Nested hash */
+#define VIRTIO_CRYPTO_SYM_HASH_MODE_NESTED 3
+ uint32_t hash_mode;
+ struct virtio_crypto_cipher_session_para cipher_param;
+ union {
+ struct virtio_crypto_hash_session_para hash_param;
+ struct virtio_crypto_mac_session_para mac_param;
+ uint8_t padding[16];
+ } u;
+ /* length of the additional authenticated data (AAD) in bytes */
+ uint32_t aad_len;
+ uint32_t padding;
+};
+
+struct virtio_crypto_alg_chain_session_req {
+ struct virtio_crypto_alg_chain_session_para para;
+};
+
+struct virtio_crypto_sym_create_session_req {
+ union {
+ struct virtio_crypto_cipher_session_req cipher;
+ struct virtio_crypto_alg_chain_session_req chain;
+ uint8_t padding[48];
+ } u;
+
+ /* Device-readable part */
+
+/* No operation */
+#define VIRTIO_CRYPTO_SYM_OP_NONE 0
+/* Cipher only operation on the data */
+#define VIRTIO_CRYPTO_SYM_OP_CIPHER 1
+/*
+ * Chain any cipher with any hash or mac operation. The order
+ * depends on the value of alg_chain_order param
+ */
+#define VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING 2
+ uint32_t op_type;
+ uint32_t padding;
+};
+
+struct virtio_crypto_destroy_session_req {
+ /* Device-readable part */
+ uint64_t session_id;
+ uint8_t padding[48];
+};
+
+/* The request of the control virtqueue's packet */
+struct virtio_crypto_op_ctrl_req {
+ struct virtio_crypto_ctrl_header header;
+
+ union {
+ struct virtio_crypto_sym_create_session_req
+ sym_create_session;
+ struct virtio_crypto_hash_create_session_req
+ hash_create_session;
+ struct virtio_crypto_mac_create_session_req
+ mac_create_session;
+ struct virtio_crypto_aead_create_session_req
+ aead_create_session;
+ struct virtio_crypto_destroy_session_req
+ destroy_session;
+ uint8_t padding[56];
+ } u;
+};
+
+struct virtio_crypto_op_header {
+#define VIRTIO_CRYPTO_CIPHER_ENCRYPT \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x00)
+#define VIRTIO_CRYPTO_CIPHER_DECRYPT \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x01)
+#define VIRTIO_CRYPTO_HASH \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x00)
+#define VIRTIO_CRYPTO_MAC \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x00)
+#define VIRTIO_CRYPTO_AEAD_ENCRYPT \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x00)
+#define VIRTIO_CRYPTO_AEAD_DECRYPT \
+ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x01)
+ uint32_t opcode;
+ /* algo should be service-specific algorithms */
+ uint32_t algo;
+ /* session_id should be service-specific algorithms */
+ uint64_t session_id;
+ /* control flag to control the request */
+ uint32_t flag;
+ uint32_t padding;
+};
+
+struct virtio_crypto_cipher_para {
+ /*
+ * Byte Length of valid IV/Counter
+ *
+ * For block ciphers in CBC or F8 mode, or for Kasumi in F8 mode, or for
+ * SNOW3G in UEA2 mode, this is the length of the IV (which
+ * must be the same as the block length of the cipher).
+ * For block ciphers in CTR mode, this is the length of the counter
+ * (which must be the same as the block length of the cipher).
+ * For AES-XTS, this is the 128bit tweak, i, from IEEE Std 1619-2007.
+ *
+ * The IV/Counter will be updated after every partial cryptographic
+ * operation.
+ */
+ uint32_t iv_len;
+ /* length of source data */
+ uint32_t src_data_len;
+ /* length of dst data */
+ uint32_t dst_data_len;
+ uint32_t padding;
+};
+
+struct virtio_crypto_hash_para {
+ /* length of source data */
+ uint32_t src_data_len;
+ /* hash result length */
+ uint32_t hash_result_len;
+};
+
+struct virtio_crypto_mac_para {
+ struct virtio_crypto_hash_para hash;
+};
+
+struct virtio_crypto_aead_para {
+ /*
+ * Byte Length of valid IV data pointed to by the below iv_addr
+ * parameter.
+ *
+ * For GCM mode, this is either 12 (for 96-bit IVs) or 16, in which
+ * case iv_addr points to J0.
+ * For CCM mode, this is the length of the nonce, which can be in the
+ * range 7 to 13 inclusive.
+ */
+ uint32_t iv_len;
+ /* length of additional auth data */
+ uint32_t aad_len;
+ /* length of source data */
+ uint32_t src_data_len;
+ /* length of dst data */
+ uint32_t dst_data_len;
+};
+
+struct virtio_crypto_cipher_data_req {
+ /* Device-readable part */
+ struct virtio_crypto_cipher_para para;
+ uint8_t padding[24];
+};
+
+struct virtio_crypto_hash_data_req {
+ /* Device-readable part */
+ struct virtio_crypto_hash_para para;
+ uint8_t padding[40];
+};
+
+struct virtio_crypto_mac_data_req {
+ /* Device-readable part */
+ struct virtio_crypto_mac_para para;
+ uint8_t padding[40];
+};
+
+struct virtio_crypto_alg_chain_data_para {
+ uint32_t iv_len;
+ /* Length of source data */
+ uint32_t src_data_len;
+ /* Length of destination data */
+ uint32_t dst_data_len;
+ /* Starting point for cipher processing in source data */
+ uint32_t cipher_start_src_offset;
+ /* Length of the source data that the cipher will be computed on */
+ uint32_t len_to_cipher;
+ /* Starting point for hash processing in source data */
+ uint32_t hash_start_src_offset;
+ /* Length of the source data that the hash will be computed on */
+ uint32_t len_to_hash;
+ /* Length of the additional auth data */
+ uint32_t aad_len;
+ /* Length of the hash result */
+ uint32_t hash_result_len;
+ uint32_t reserved;
+};
+
+struct virtio_crypto_alg_chain_data_req {
+ /* Device-readable part */
+ struct virtio_crypto_alg_chain_data_para para;
+};
+
+struct virtio_crypto_sym_data_req {
+ union {
+ struct virtio_crypto_cipher_data_req cipher;
+ struct virtio_crypto_alg_chain_data_req chain;
+ uint8_t padding[40];
+ } u;
+
+ /* See above VIRTIO_CRYPTO_SYM_OP_* */
+ uint32_t op_type;
+ uint32_t padding;
+};
+
+struct virtio_crypto_aead_data_req {
+ /* Device-readable part */
+ struct virtio_crypto_aead_para para;
+ uint8_t padding[32];
+};
+
+/* The request of the data virtqueue's packet */
+struct virtio_crypto_op_data_req {
+ struct virtio_crypto_op_header header;
+
+ union {
+ struct virtio_crypto_sym_data_req sym_req;
+ struct virtio_crypto_hash_data_req hash_req;
+ struct virtio_crypto_mac_data_req mac_req;
+ struct virtio_crypto_aead_data_req aead_req;
+ uint8_t padding[48];
+ } u;
+};
+
+#define VIRTIO_CRYPTO_OK 0
+#define VIRTIO_CRYPTO_ERR 1
+#define VIRTIO_CRYPTO_BADMSG 2
+#define VIRTIO_CRYPTO_NOTSUPP 3
+#define VIRTIO_CRYPTO_INVSESS 4 /* Invalid session id */
+
+/* The accelerator hardware is ready */
+#define VIRTIO_CRYPTO_S_HW_READY (1 << 0)
+
+struct virtio_crypto_config {
+ /* See VIRTIO_CRYPTO_OP_* above */
+ uint32_t status;
+
+ /*
+ * Maximum number of data queue
+ */
+ uint32_t max_dataqueues;
+
+ /*
+ * Specifies the services mask which the device support,
+ * see VIRTIO_CRYPTO_SERVICE_* above
+ */
+ uint32_t crypto_services;
+
+ /* Detailed algorithms mask */
+ uint32_t cipher_algo_l;
+ uint32_t cipher_algo_h;
+ uint32_t hash_algo;
+ uint32_t mac_algo_l;
+ uint32_t mac_algo_h;
+ uint32_t aead_algo;
+ /* Maximum length of cipher key */
+ uint32_t max_cipher_key_len;
+ /* Maximum length of authenticated key */
+ uint32_t max_auth_key_len;
+ uint32_t reserve;
+ /* Maximum size of each crypto request's content */
+ uint64_t max_size;
+};
+
+struct virtio_crypto_inhdr {
+ /* See VIRTIO_CRYPTO_* above */
+ uint8_t status;
+};
+#endif /* _VIRTIO_CRYPTO_H */
diff --git a/src/seastar/dpdk/lib/librte_vhost/virtio_net.c b/src/seastar/dpdk/lib/librte_vhost/virtio_net.c
new file mode 100644
index 000000000..a6a33a101
--- /dev/null
+++ b/src/seastar/dpdk/lib/librte_vhost/virtio_net.c
@@ -0,0 +1,1638 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/virtio_net.h>
+
+#include <rte_mbuf.h>
+#include <rte_memcpy.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_vhost.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+#include <rte_sctp.h>
+#include <rte_arp.h>
+#include <rte_spinlock.h>
+#include <rte_malloc.h>
+
+#include "iotlb.h"
+#include "vhost.h"
+
+#define MAX_PKT_BURST 32
+
+#define MAX_BATCH_LEN 256
+
+static __rte_always_inline bool
+rxvq_is_mergeable(struct virtio_net *dev)
+{
+ return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
+}
+
+static bool
+is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
+{
+ return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
+}
+
+static __rte_always_inline void
+do_flush_shadow_used_ring_split(struct virtio_net *dev,
+ struct vhost_virtqueue *vq,
+ uint16_t to, uint16_t from, uint16_t size)
+{
+ rte_memcpy(&vq->used->ring[to],
+ &vq->shadow_used_split[from],
+ size * sizeof(struct vring_used_elem));
+ vhost_log_cache_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[to]),
+ size * sizeof(struct vring_used_elem));
+}
+
+static __rte_always_inline void
+flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+ if (used_idx + vq->shadow_used_idx <= vq->size) {
+ do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
+ vq->shadow_used_idx);
+ } else {
+ uint16_t size;
+
+ /* update used ring interval [used_idx, vq->size] */
+ size = vq->size - used_idx;
+ do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
+
+ /* update the left half used ring interval [0, left_size] */
+ do_flush_shadow_used_ring_split(dev, vq, 0, size,
+ vq->shadow_used_idx - size);
+ }
+ vq->last_used_idx += vq->shadow_used_idx;
+
+ rte_smp_wmb();
+
+ vhost_log_cache_sync(dev, vq);
+
+ *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+ vq->shadow_used_idx = 0;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
+}
+
+static __rte_always_inline void
+update_shadow_used_ring_split(struct vhost_virtqueue *vq,
+ uint16_t desc_idx, uint32_t len)
+{
+ uint16_t i = vq->shadow_used_idx++;
+
+ vq->shadow_used_split[i].id = desc_idx;
+ vq->shadow_used_split[i].len = len;
+}
+
+static __rte_always_inline void
+flush_shadow_used_ring_packed(struct virtio_net *dev,
+ struct vhost_virtqueue *vq)
+{
+ int i;
+ uint16_t used_idx = vq->last_used_idx;
+ uint16_t head_idx = vq->last_used_idx;
+ uint16_t head_flags = 0;
+
+ /* Split loop in two to save memory barriers */
+ for (i = 0; i < vq->shadow_used_idx; i++) {
+ vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
+ vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
+
+ used_idx += vq->shadow_used_packed[i].count;
+ if (used_idx >= vq->size)
+ used_idx -= vq->size;
+ }
+
+ rte_smp_wmb();
+
+ for (i = 0; i < vq->shadow_used_idx; i++) {
+ uint16_t flags;
+
+ if (vq->shadow_used_packed[i].len)
+ flags = VRING_DESC_F_WRITE;
+ else
+ flags = 0;
+
+ if (vq->used_wrap_counter) {
+ flags |= VRING_DESC_F_USED;
+ flags |= VRING_DESC_F_AVAIL;
+ } else {
+ flags &= ~VRING_DESC_F_USED;
+ flags &= ~VRING_DESC_F_AVAIL;
+ }
+
+ if (i > 0) {
+ vq->desc_packed[vq->last_used_idx].flags = flags;
+
+ vhost_log_cache_used_vring(dev, vq,
+ vq->last_used_idx *
+ sizeof(struct vring_packed_desc),
+ sizeof(struct vring_packed_desc));
+ } else {
+ head_idx = vq->last_used_idx;
+ head_flags = flags;
+ }
+
+ vq->last_used_idx += vq->shadow_used_packed[i].count;
+ if (vq->last_used_idx >= vq->size) {
+ vq->used_wrap_counter ^= 1;
+ vq->last_used_idx -= vq->size;
+ }
+ }
+
+ vq->desc_packed[head_idx].flags = head_flags;
+
+ vhost_log_cache_used_vring(dev, vq,
+ head_idx *
+ sizeof(struct vring_packed_desc),
+ sizeof(struct vring_packed_desc));
+
+ vq->shadow_used_idx = 0;
+ vhost_log_cache_sync(dev, vq);
+}
+
+static __rte_always_inline void
+update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
+ uint16_t desc_idx, uint32_t len, uint16_t count)
+{
+ uint16_t i = vq->shadow_used_idx++;
+
+ vq->shadow_used_packed[i].id = desc_idx;
+ vq->shadow_used_packed[i].len = len;
+ vq->shadow_used_packed[i].count = count;
+}
+
+static inline void
+do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ struct batch_copy_elem *elem = vq->batch_copy_elems;
+ uint16_t count = vq->batch_copy_nb_elems;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+ vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len);
+ PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
+ }
+
+ vq->batch_copy_nb_elems = 0;
+}
+
+static inline void
+do_data_copy_dequeue(struct vhost_virtqueue *vq)
+{
+ struct batch_copy_elem *elem = vq->batch_copy_elems;
+ uint16_t count = vq->batch_copy_nb_elems;
+ int i;
+
+ for (i = 0; i < count; i++)
+ rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+
+ vq->batch_copy_nb_elems = 0;
+}
+
+/* avoid write operation when necessary, to lessen cache issues */
+#define ASSIGN_UNLESS_EQUAL(var, val) do { \
+ if ((var) != (val)) \
+ (var) = (val); \
+} while (0)
+
+static __rte_always_inline void
+virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
+{
+ uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
+
+ if (m_buf->ol_flags & PKT_TX_TCP_SEG)
+ csum_l4 |= PKT_TX_TCP_CKSUM;
+
+ if (csum_l4) {
+ net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
+
+ switch (csum_l4) {
+ case PKT_TX_TCP_CKSUM:
+ net_hdr->csum_offset = (offsetof(struct tcp_hdr,
+ cksum));
+ break;
+ case PKT_TX_UDP_CKSUM:
+ net_hdr->csum_offset = (offsetof(struct udp_hdr,
+ dgram_cksum));
+ break;
+ case PKT_TX_SCTP_CKSUM:
+ net_hdr->csum_offset = (offsetof(struct sctp_hdr,
+ cksum));
+ break;
+ }
+ } else {
+ ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
+ ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
+ ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
+ }
+
+ /* IP cksum verification cannot be bypassed, then calculate here */
+ if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
+ struct ipv4_hdr *ipv4_hdr;
+
+ ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
+ m_buf->l2_len);
+ ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
+ }
+
+ if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
+ if (m_buf->ol_flags & PKT_TX_IPV4)
+ net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else
+ net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ net_hdr->gso_size = m_buf->tso_segsz;
+ net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ + m_buf->l4_len;
+ } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
+ net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+ net_hdr->gso_size = m_buf->tso_segsz;
+ net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
+ m_buf->l4_len;
+ } else {
+ ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
+ ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
+ ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
+ }
+}
+
+static __rte_always_inline int
+map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct buf_vector *buf_vec, uint16_t *vec_idx,
+ uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
+{
+ uint16_t vec_id = *vec_idx;
+
+ while (desc_len) {
+ uint64_t desc_addr;
+ uint64_t desc_chunck_len = desc_len;
+
+ if (unlikely(vec_id >= BUF_VECTOR_MAX))
+ return -1;
+
+ desc_addr = vhost_iova_to_vva(dev, vq,
+ desc_iova,
+ &desc_chunck_len,
+ perm);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ buf_vec[vec_id].buf_iova = desc_iova;
+ buf_vec[vec_id].buf_addr = desc_addr;
+ buf_vec[vec_id].buf_len = desc_chunck_len;
+
+ desc_len -= desc_chunck_len;
+ desc_iova += desc_chunck_len;
+ vec_id++;
+ }
+ *vec_idx = vec_id;
+
+ return 0;
+}
+
+static __rte_always_inline int
+fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t avail_idx, uint16_t *vec_idx,
+ struct buf_vector *buf_vec, uint16_t *desc_chain_head,
+ uint32_t *desc_chain_len, uint8_t perm)
+{
+ uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
+ uint16_t vec_id = *vec_idx;
+ uint32_t len = 0;
+ uint64_t dlen;
+ uint32_t nr_descs = vq->size;
+ uint32_t cnt = 0;
+ struct vring_desc *descs = vq->desc;
+ struct vring_desc *idesc = NULL;
+
+ if (unlikely(idx >= vq->size))
+ return -1;
+
+ *desc_chain_head = idx;
+
+ if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
+ dlen = vq->desc[idx].len;
+ nr_descs = dlen / sizeof(struct vring_desc);
+ if (unlikely(nr_descs > vq->size))
+ return -1;
+
+ descs = (struct vring_desc *)(uintptr_t)
+ vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
+ &dlen,
+ VHOST_ACCESS_RO);
+ if (unlikely(!descs))
+ return -1;
+
+ if (unlikely(dlen < vq->desc[idx].len)) {
+ /*
+ * The indirect desc table is not contiguous
+ * in process VA space, we have to copy it.
+ */
+ idesc = alloc_copy_ind_table(dev, vq,
+ vq->desc[idx].addr, vq->desc[idx].len);
+ if (unlikely(!idesc))
+ return -1;
+
+ descs = idesc;
+ }
+
+ idx = 0;
+ }
+
+ while (1) {
+ if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
+ free_ind_table(idesc);
+ return -1;
+ }
+
+ len += descs[idx].len;
+
+ if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
+ descs[idx].addr, descs[idx].len,
+ perm))) {
+ free_ind_table(idesc);
+ return -1;
+ }
+
+ if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
+ break;
+
+ idx = descs[idx].next;
+ }
+
+ *desc_chain_len = len;
+ *vec_idx = vec_id;
+
+ if (unlikely(!!idesc))
+ free_ind_table(idesc);
+
+ return 0;
+}
+
+/*
+ * Returns -1 on fail, 0 on success
+ */
+static inline int
+reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t size, struct buf_vector *buf_vec,
+ uint16_t *num_buffers, uint16_t avail_head,
+ uint16_t *nr_vec)
+{
+ uint16_t cur_idx;
+ uint16_t vec_idx = 0;
+ uint16_t max_tries, tries = 0;
+
+ uint16_t head_idx = 0;
+ uint32_t len = 0;
+
+ *num_buffers = 0;
+ cur_idx = vq->last_avail_idx;
+
+ if (rxvq_is_mergeable(dev))
+ max_tries = vq->size - 1;
+ else
+ max_tries = 1;
+
+ while (size > 0) {
+ if (unlikely(cur_idx == avail_head))
+ return -1;
+ /*
+ * if we tried all available ring items, and still
+ * can't get enough buf, it means something abnormal
+ * happened.
+ */
+ if (unlikely(++tries > max_tries))
+ return -1;
+
+ if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
+ &vec_idx, buf_vec,
+ &head_idx, &len,
+ VHOST_ACCESS_RW) < 0))
+ return -1;
+ len = RTE_MIN(len, size);
+ update_shadow_used_ring_split(vq, head_idx, len);
+ size -= len;
+
+ cur_idx++;
+ *num_buffers += 1;
+ }
+
+ *nr_vec = vec_idx;
+
+ return 0;
+}
+
+static __rte_always_inline int
+fill_vec_buf_packed_indirect(struct virtio_net *dev,
+ struct vhost_virtqueue *vq,
+ struct vring_packed_desc *desc, uint16_t *vec_idx,
+ struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
+{
+ uint16_t i;
+ uint32_t nr_descs;
+ uint16_t vec_id = *vec_idx;
+ uint64_t dlen;
+ struct vring_packed_desc *descs, *idescs = NULL;
+
+ dlen = desc->len;
+ descs = (struct vring_packed_desc *)(uintptr_t)
+ vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
+ if (unlikely(!descs))
+ return -1;
+
+ if (unlikely(dlen < desc->len)) {
+ /*
+ * The indirect desc table is not contiguous
+ * in process VA space, we have to copy it.
+ */
+ idescs = alloc_copy_ind_table(dev, vq, desc->addr, desc->len);
+ if (unlikely(!idescs))
+ return -1;
+
+ descs = idescs;
+ }
+
+ nr_descs = desc->len / sizeof(struct vring_packed_desc);
+ if (unlikely(nr_descs >= vq->size)) {
+ free_ind_table(idescs);
+ return -1;
+ }
+
+ for (i = 0; i < nr_descs; i++) {
+ if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
+ free_ind_table(idescs);
+ return -1;
+ }
+
+ *len += descs[i].len;
+ if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
+ descs[i].addr, descs[i].len,
+ perm)))
+ return -1;
+ }
+ *vec_idx = vec_id;
+
+ if (unlikely(!!idescs))
+ free_ind_table(idescs);
+
+ return 0;
+}
+
+static __rte_always_inline int
+fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, uint16_t *desc_count,
+ struct buf_vector *buf_vec, uint16_t *vec_idx,
+ uint16_t *buf_id, uint32_t *len, uint8_t perm)
+{
+ bool wrap_counter = vq->avail_wrap_counter;
+ struct vring_packed_desc *descs = vq->desc_packed;
+ uint16_t vec_id = *vec_idx;
+
+ if (avail_idx < vq->last_avail_idx)
+ wrap_counter ^= 1;
+
+ if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
+ return -1;
+
+ /*
+ * The ordering between desc flags and desc
+ * content reads need to be enforced.
+ */
+ rte_smp_rmb();
+
+ *desc_count = 0;
+ *len = 0;
+
+ while (1) {
+ if (unlikely(vec_id >= BUF_VECTOR_MAX))
+ return -1;
+
+ if (unlikely(*desc_count >= vq->size))
+ return -1;
+
+ *desc_count += 1;
+ *buf_id = descs[avail_idx].id;
+
+ if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
+ if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
+ &descs[avail_idx],
+ &vec_id, buf_vec,
+ len, perm) < 0))
+ return -1;
+ } else {
+ *len += descs[avail_idx].len;
+
+ if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
+ descs[avail_idx].addr,
+ descs[avail_idx].len,
+ perm)))
+ return -1;
+ }
+
+ if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
+ break;
+
+ if (++avail_idx >= vq->size) {
+ avail_idx -= vq->size;
+ wrap_counter ^= 1;
+ }
+ }
+
+ *vec_idx = vec_id;
+
+ return 0;
+}
+
+/*
+ * Returns -1 on fail, 0 on success
+ */
+static inline int
+reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t size, struct buf_vector *buf_vec,
+ uint16_t *nr_vec, uint16_t *num_buffers,
+ uint16_t *nr_descs)
+{
+ uint16_t avail_idx;
+ uint16_t vec_idx = 0;
+ uint16_t max_tries, tries = 0;
+
+ uint16_t buf_id = 0;
+ uint32_t len = 0;
+ uint16_t desc_count;
+
+ *num_buffers = 0;
+ avail_idx = vq->last_avail_idx;
+
+ if (rxvq_is_mergeable(dev))
+ max_tries = vq->size - 1;
+ else
+ max_tries = 1;
+
+ while (size > 0) {
+ /*
+ * if we tried all available ring items, and still
+ * can't get enough buf, it means something abnormal
+ * happened.
+ */
+ if (unlikely(++tries > max_tries))
+ return -1;
+
+ if (unlikely(fill_vec_buf_packed(dev, vq,
+ avail_idx, &desc_count,
+ buf_vec, &vec_idx,
+ &buf_id, &len,
+ VHOST_ACCESS_RW) < 0))
+ return -1;
+
+ len = RTE_MIN(len, size);
+ update_shadow_used_ring_packed(vq, buf_id, len, desc_count);
+ size -= len;
+
+ avail_idx += desc_count;
+ if (avail_idx >= vq->size)
+ avail_idx -= vq->size;
+
+ *nr_descs += desc_count;
+ *num_buffers += 1;
+ }
+
+ *nr_vec = vec_idx;
+
+ return 0;
+}
+
+static __rte_always_inline int
+copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct rte_mbuf *m, struct buf_vector *buf_vec,
+ uint16_t nr_vec, uint16_t num_buffers)
+{
+ uint32_t vec_idx = 0;
+ uint32_t mbuf_offset, mbuf_avail;
+ uint32_t buf_offset, buf_avail;
+ uint64_t buf_addr, buf_iova, buf_len;
+ uint32_t cpy_len;
+ uint64_t hdr_addr;
+ struct rte_mbuf *hdr_mbuf;
+ struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+ struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
+ int error = 0;
+
+ if (unlikely(m == NULL)) {
+ error = -1;
+ goto out;
+ }
+
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_iova = buf_vec[vec_idx].buf_iova;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ if (nr_vec > 1)
+ rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr);
+
+ if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
+ error = -1;
+ goto out;
+ }
+
+ hdr_mbuf = m;
+ hdr_addr = buf_addr;
+ if (unlikely(buf_len < dev->vhost_hlen))
+ hdr = &tmp_hdr;
+ else
+ hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
+
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
+ dev->vid, num_buffers);
+
+ if (unlikely(buf_len < dev->vhost_hlen)) {
+ buf_offset = dev->vhost_hlen - buf_len;
+ vec_idx++;
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_iova = buf_vec[vec_idx].buf_iova;
+ buf_len = buf_vec[vec_idx].buf_len;
+ buf_avail = buf_len - buf_offset;
+ } else {
+ buf_offset = dev->vhost_hlen;
+ buf_avail = buf_len - dev->vhost_hlen;
+ }
+
+ mbuf_avail = rte_pktmbuf_data_len(m);
+ mbuf_offset = 0;
+ while (mbuf_avail != 0 || m->next != NULL) {
+ /* done with current buf, get the next one */
+ if (buf_avail == 0) {
+ vec_idx++;
+ if (unlikely(vec_idx >= nr_vec)) {
+ error = -1;
+ goto out;
+ }
+
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_iova = buf_vec[vec_idx].buf_iova;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ /* Prefetch next buffer address. */
+ if (vec_idx + 1 < nr_vec)
+ rte_prefetch0((void *)(uintptr_t)
+ buf_vec[vec_idx + 1].buf_addr);
+ buf_offset = 0;
+ buf_avail = buf_len;
+ }
+
+ /* done with current mbuf, get the next one */
+ if (mbuf_avail == 0) {
+ m = m->next;
+
+ mbuf_offset = 0;
+ mbuf_avail = rte_pktmbuf_data_len(m);
+ }
+
+ if (hdr_addr) {
+ virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
+ if (rxvq_is_mergeable(dev))
+ ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
+ num_buffers);
+
+ if (unlikely(hdr == &tmp_hdr)) {
+ uint64_t len;
+ uint64_t remain = dev->vhost_hlen;
+ uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
+ uint64_t iova = buf_vec[0].buf_iova;
+ uint16_t hdr_vec_idx = 0;
+
+ while (remain) {
+ len = RTE_MIN(remain,
+ buf_vec[hdr_vec_idx].buf_len);
+ dst = buf_vec[hdr_vec_idx].buf_addr;
+ rte_memcpy((void *)(uintptr_t)dst,
+ (void *)(uintptr_t)src,
+ len);
+
+ PRINT_PACKET(dev, (uintptr_t)dst,
+ (uint32_t)len, 0);
+ vhost_log_cache_write(dev, vq,
+ iova, len);
+
+ remain -= len;
+ iova += len;
+ src += len;
+ hdr_vec_idx++;
+ }
+ } else {
+ PRINT_PACKET(dev, (uintptr_t)hdr_addr,
+ dev->vhost_hlen, 0);
+ vhost_log_cache_write(dev, vq,
+ buf_vec[0].buf_iova,
+ dev->vhost_hlen);
+ }
+
+ hdr_addr = 0;
+ }
+
+ cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+ if (likely(cpy_len > MAX_BATCH_LEN ||
+ vq->batch_copy_nb_elems >= vq->size)) {
+ rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
+ rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+ cpy_len);
+ vhost_log_cache_write(dev, vq, buf_iova + buf_offset,
+ cpy_len);
+ PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+ cpy_len, 0);
+ } else {
+ batch_copy[vq->batch_copy_nb_elems].dst =
+ (void *)((uintptr_t)(buf_addr + buf_offset));
+ batch_copy[vq->batch_copy_nb_elems].src =
+ rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
+ batch_copy[vq->batch_copy_nb_elems].log_addr =
+ buf_iova + buf_offset;
+ batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
+ vq->batch_copy_nb_elems++;
+ }
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ buf_avail -= cpy_len;
+ buf_offset += cpy_len;
+ }
+
+out:
+
+ return error;
+}
+
+static __rte_always_inline uint32_t
+virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct rte_mbuf **pkts, uint32_t count)
+{
+ uint32_t pkt_idx = 0;
+ uint16_t num_buffers;
+ struct buf_vector buf_vec[BUF_VECTOR_MAX];
+ uint16_t avail_head;
+
+ avail_head = *((volatile uint16_t *)&vq->avail->idx);
+
+ /*
+ * The ordering between avail index and
+ * desc reads needs to be enforced.
+ */
+ rte_smp_rmb();
+
+ rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+ for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+ uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+ uint16_t nr_vec = 0;
+
+ if (unlikely(reserve_avail_buf_split(dev, vq,
+ pkt_len, buf_vec, &num_buffers,
+ avail_head, &nr_vec) < 0)) {
+ VHOST_LOG_DEBUG(VHOST_DATA,
+ "(%d) failed to get enough desc from vring\n",
+ dev->vid);
+ vq->shadow_used_idx -= num_buffers;
+ break;
+ }
+
+ rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
+
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
+ dev->vid, vq->last_avail_idx,
+ vq->last_avail_idx + num_buffers);
+
+ if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
+ buf_vec, nr_vec,
+ num_buffers) < 0) {
+ vq->shadow_used_idx -= num_buffers;
+ break;
+ }
+
+ vq->last_avail_idx += num_buffers;
+ }
+
+ do_data_copy_enqueue(dev, vq);
+
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_split(dev, vq);
+ vhost_vring_call_split(dev, vq);
+ }
+
+ return pkt_idx;
+}
+
+static __rte_always_inline uint32_t
+virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct rte_mbuf **pkts, uint32_t count)
+{
+ uint32_t pkt_idx = 0;
+ uint16_t num_buffers;
+ struct buf_vector buf_vec[BUF_VECTOR_MAX];
+
+ for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+ uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+ uint16_t nr_vec = 0;
+ uint16_t nr_descs = 0;
+
+ if (unlikely(reserve_avail_buf_packed(dev, vq,
+ pkt_len, buf_vec, &nr_vec,
+ &num_buffers, &nr_descs) < 0)) {
+ VHOST_LOG_DEBUG(VHOST_DATA,
+ "(%d) failed to get enough desc from vring\n",
+ dev->vid);
+ vq->shadow_used_idx -= num_buffers;
+ break;
+ }
+
+ rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
+
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
+ dev->vid, vq->last_avail_idx,
+ vq->last_avail_idx + num_buffers);
+
+ if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
+ buf_vec, nr_vec,
+ num_buffers) < 0) {
+ vq->shadow_used_idx -= num_buffers;
+ break;
+ }
+
+ vq->last_avail_idx += nr_descs;
+ if (vq->last_avail_idx >= vq->size) {
+ vq->last_avail_idx -= vq->size;
+ vq->avail_wrap_counter ^= 1;
+ }
+ }
+
+ do_data_copy_enqueue(dev, vq);
+
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_packed(dev, vq);
+ vhost_vring_call_packed(dev, vq);
+ }
+
+ return pkt_idx;
+}
+
+static __rte_always_inline uint32_t
+virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint32_t count)
+{
+ struct vhost_virtqueue *vq;
+ uint32_t nb_tx = 0;
+
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
+ RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, queue_id);
+ return 0;
+ }
+
+ vq = dev->virtqueue[queue_id];
+
+ rte_spinlock_lock(&vq->access_lock);
+
+ if (unlikely(vq->enabled == 0))
+ goto out_access_unlock;
+
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_rd_lock(vq);
+
+ if (unlikely(vq->access_ok == 0))
+ if (unlikely(vring_translate(dev, vq) < 0))
+ goto out;
+
+ count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+ if (count == 0)
+ goto out;
+
+ if (vq_is_packed(dev))
+ nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
+ else
+ nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
+
+out:
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
+ return nb_tx;
+}
+
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (!dev)
+ return 0;
+
+ if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "(%d) %s: built-in vhost net backend is disabled.\n",
+ dev->vid, __func__);
+ return 0;
+ }
+
+ return virtio_dev_rx(dev, queue_id, pkts, count);
+}
+
+static inline bool
+virtio_net_with_host_offload(struct virtio_net *dev)
+{
+ if (dev->features &
+ ((1ULL << VIRTIO_NET_F_CSUM) |
+ (1ULL << VIRTIO_NET_F_HOST_ECN) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) |
+ (1ULL << VIRTIO_NET_F_HOST_UFO)))
+ return true;
+
+ return false;
+}
+
+static void
+parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct ipv6_hdr *ipv6_hdr;
+ void *l3_hdr = NULL;
+ struct ether_hdr *eth_hdr;
+ uint16_t ethertype;
+
+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+ m->l2_len = sizeof(struct ether_hdr);
+ ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+
+ if (ethertype == ETHER_TYPE_VLAN) {
+ struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+
+ m->l2_len += sizeof(struct vlan_hdr);
+ ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+ }
+
+ l3_hdr = (char *)eth_hdr + m->l2_len;
+
+ switch (ethertype) {
+ case ETHER_TYPE_IPv4:
+ ipv4_hdr = l3_hdr;
+ *l4_proto = ipv4_hdr->next_proto_id;
+ m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
+ *l4_hdr = (char *)l3_hdr + m->l3_len;
+ m->ol_flags |= PKT_TX_IPV4;
+ break;
+ case ETHER_TYPE_IPv6:
+ ipv6_hdr = l3_hdr;
+ *l4_proto = ipv6_hdr->proto;
+ m->l3_len = sizeof(struct ipv6_hdr);
+ *l4_hdr = (char *)l3_hdr + m->l3_len;
+ m->ol_flags |= PKT_TX_IPV6;
+ break;
+ default:
+ m->l3_len = 0;
+ *l4_proto = 0;
+ *l4_hdr = NULL;
+ break;
+ }
+}
+
+static __rte_always_inline void
+vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
+{
+ uint16_t l4_proto = 0;
+ void *l4_hdr = NULL;
+ struct tcp_hdr *tcp_hdr = NULL;
+
+ if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
+ return;
+
+ parse_ethernet(m, &l4_proto, &l4_hdr);
+ if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ if (hdr->csum_start == (m->l2_len + m->l3_len)) {
+ switch (hdr->csum_offset) {
+ case (offsetof(struct tcp_hdr, cksum)):
+ if (l4_proto == IPPROTO_TCP)
+ m->ol_flags |= PKT_TX_TCP_CKSUM;
+ break;
+ case (offsetof(struct udp_hdr, dgram_cksum)):
+ if (l4_proto == IPPROTO_UDP)
+ m->ol_flags |= PKT_TX_UDP_CKSUM;
+ break;
+ case (offsetof(struct sctp_hdr, cksum)):
+ if (l4_proto == IPPROTO_SCTP)
+ m->ol_flags |= PKT_TX_SCTP_CKSUM;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ tcp_hdr = l4_hdr;
+ m->ol_flags |= PKT_TX_TCP_SEG;
+ m->tso_segsz = hdr->gso_size;
+ m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+ break;
+ case VIRTIO_NET_HDR_GSO_UDP:
+ m->ol_flags |= PKT_TX_UDP_SEG;
+ m->tso_segsz = hdr->gso_size;
+ m->l4_len = sizeof(struct udp_hdr);
+ break;
+ default:
+ RTE_LOG(WARNING, VHOST_DATA,
+ "unsupported gso type %u.\n", hdr->gso_type);
+ break;
+ }
+ }
+}
+
+static __rte_always_inline int
+copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct buf_vector *buf_vec, uint16_t nr_vec,
+ struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
+{
+ uint32_t buf_avail, buf_offset;
+ uint64_t buf_addr, buf_iova, buf_len;
+ uint32_t mbuf_avail, mbuf_offset;
+ uint32_t cpy_len;
+ struct rte_mbuf *cur = m, *prev = m;
+ struct virtio_net_hdr tmp_hdr;
+ struct virtio_net_hdr *hdr = NULL;
+ /* A counter to avoid desc dead loop chain */
+ uint16_t vec_idx = 0;
+ struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+ int error = 0;
+
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_iova = buf_vec[vec_idx].buf_iova;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
+ error = -1;
+ goto out;
+ }
+
+ if (likely(nr_vec > 1))
+ rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr);
+
+ if (virtio_net_with_host_offload(dev)) {
+ if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+ uint64_t len;
+ uint64_t remain = sizeof(struct virtio_net_hdr);
+ uint64_t src;
+ uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
+ uint16_t hdr_vec_idx = 0;
+
+ /*
+ * No luck, the virtio-net header doesn't fit
+ * in a contiguous virtual area.
+ */
+ while (remain) {
+ len = RTE_MIN(remain,
+ buf_vec[hdr_vec_idx].buf_len);
+ src = buf_vec[hdr_vec_idx].buf_addr;
+ rte_memcpy((void *)(uintptr_t)dst,
+ (void *)(uintptr_t)src, len);
+
+ remain -= len;
+ dst += len;
+ hdr_vec_idx++;
+ }
+
+ hdr = &tmp_hdr;
+ } else {
+ hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+ rte_prefetch0(hdr);
+ }
+ }
+
+ /*
+ * A virtio driver normally uses at least 2 desc buffers
+ * for Tx: the first for storing the header, and others
+ * for storing the data.
+ */
+ if (unlikely(buf_len < dev->vhost_hlen)) {
+ buf_offset = dev->vhost_hlen - buf_len;
+ vec_idx++;
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_iova = buf_vec[vec_idx].buf_iova;
+ buf_len = buf_vec[vec_idx].buf_len;
+ buf_avail = buf_len - buf_offset;
+ } else if (buf_len == dev->vhost_hlen) {
+ if (unlikely(++vec_idx >= nr_vec))
+ goto out;
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_iova = buf_vec[vec_idx].buf_iova;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ buf_offset = 0;
+ buf_avail = buf_len;
+ } else {
+ buf_offset = dev->vhost_hlen;
+ buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+ }
+
+ rte_prefetch0((void *)(uintptr_t)
+ (buf_addr + buf_offset));
+
+ PRINT_PACKET(dev,
+ (uintptr_t)(buf_addr + buf_offset),
+ (uint32_t)buf_avail, 0);
+
+ mbuf_offset = 0;
+ mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
+ while (1) {
+ uint64_t hpa;
+
+ cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+ /*
+ * A desc buf might across two host physical pages that are
+ * not continuous. In such case (gpa_to_hpa returns 0), data
+ * will be copied even though zero copy is enabled.
+ */
+ if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
+ buf_iova + buf_offset, cpy_len)))) {
+ cur->data_len = cpy_len;
+ cur->data_off = 0;
+ cur->buf_addr =
+ (void *)(uintptr_t)(buf_addr + buf_offset);
+ cur->buf_iova = hpa;
+
+ /*
+ * In zero copy mode, one mbuf can only reference data
+ * for one or partial of one desc buff.
+ */
+ mbuf_avail = cpy_len;
+ } else {
+ if (likely(cpy_len > MAX_BATCH_LEN ||
+ vq->batch_copy_nb_elems >= vq->size ||
+ (hdr && cur == m))) {
+ rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
+ mbuf_offset),
+ (void *)((uintptr_t)(buf_addr +
+ buf_offset)),
+ cpy_len);
+ } else {
+ batch_copy[vq->batch_copy_nb_elems].dst =
+ rte_pktmbuf_mtod_offset(cur, void *,
+ mbuf_offset);
+ batch_copy[vq->batch_copy_nb_elems].src =
+ (void *)((uintptr_t)(buf_addr +
+ buf_offset));
+ batch_copy[vq->batch_copy_nb_elems].len =
+ cpy_len;
+ vq->batch_copy_nb_elems++;
+ }
+ }
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ buf_avail -= cpy_len;
+ buf_offset += cpy_len;
+
+ /* This buf reaches to its end, get the next one */
+ if (buf_avail == 0) {
+ if (++vec_idx >= nr_vec)
+ break;
+
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_iova = buf_vec[vec_idx].buf_iova;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ /*
+ * Prefecth desc n + 1 buffer while
+ * desc n buffer is processed.
+ */
+ if (vec_idx + 1 < nr_vec)
+ rte_prefetch0((void *)(uintptr_t)
+ buf_vec[vec_idx + 1].buf_addr);
+
+ buf_offset = 0;
+ buf_avail = buf_len;
+
+ PRINT_PACKET(dev, (uintptr_t)buf_addr,
+ (uint32_t)buf_avail, 0);
+ }
+
+ /*
+ * This mbuf reaches to its end, get a new one
+ * to hold more data.
+ */
+ if (mbuf_avail == 0) {
+ cur = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(cur == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA, "Failed to "
+ "allocate memory for mbuf.\n");
+ error = -1;
+ goto out;
+ }
+ if (unlikely(dev->dequeue_zero_copy))
+ rte_mbuf_refcnt_update(cur, 1);
+
+ prev->next = cur;
+ prev->data_len = mbuf_offset;
+ m->nb_segs += 1;
+ m->pkt_len += mbuf_offset;
+ prev = cur;
+
+ mbuf_offset = 0;
+ mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+ }
+ }
+
+ prev->data_len = mbuf_offset;
+ m->pkt_len += mbuf_offset;
+
+ if (hdr)
+ vhost_dequeue_offload(hdr, m);
+
+out:
+
+ return error;
+}
+
+static __rte_always_inline struct zcopy_mbuf *
+get_zmbuf(struct vhost_virtqueue *vq)
+{
+ uint16_t i;
+ uint16_t last;
+ int tries = 0;
+
+ /* search [last_zmbuf_idx, zmbuf_size) */
+ i = vq->last_zmbuf_idx;
+ last = vq->zmbuf_size;
+
+again:
+ for (; i < last; i++) {
+ if (vq->zmbufs[i].in_use == 0) {
+ vq->last_zmbuf_idx = i + 1;
+ vq->zmbufs[i].in_use = 1;
+ return &vq->zmbufs[i];
+ }
+ }
+
+ tries++;
+ if (tries == 1) {
+ /* search [0, last_zmbuf_idx) */
+ i = 0;
+ last = vq->last_zmbuf_idx;
+ goto again;
+ }
+
+ return NULL;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+ uint16_t i;
+ uint16_t free_entries;
+
+ if (unlikely(dev->dequeue_zero_copy)) {
+ struct zcopy_mbuf *zmbuf, *next;
+
+ for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+ zmbuf != NULL; zmbuf = next) {
+ next = TAILQ_NEXT(zmbuf, next);
+
+ if (mbuf_is_consumed(zmbuf->mbuf)) {
+ update_shadow_used_ring_split(vq,
+ zmbuf->desc_idx, 0);
+ TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ restore_mbuf(zmbuf->mbuf);
+ rte_pktmbuf_free(zmbuf->mbuf);
+ put_zmbuf(zmbuf);
+ vq->nr_zmbuf -= 1;
+ }
+ }
+
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_split(dev, vq);
+ vhost_vring_call_split(dev, vq);
+ }
+ }
+
+ free_entries = *((volatile uint16_t *)&vq->avail->idx) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ return 0;
+
+ /*
+ * The ordering between avail index and
+ * desc reads needs to be enforced.
+ */
+ rte_smp_rmb();
+
+ rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+ count = RTE_MIN(count, MAX_PKT_BURST);
+ count = RTE_MIN(count, free_entries);
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+ dev->vid, count);
+
+ for (i = 0; i < count; i++) {
+ struct buf_vector buf_vec[BUF_VECTOR_MAX];
+ uint16_t head_idx;
+ uint32_t dummy_len;
+ uint16_t nr_vec = 0;
+ int err;
+
+ if (unlikely(fill_vec_buf_split(dev, vq,
+ vq->last_avail_idx + i,
+ &nr_vec, buf_vec,
+ &head_idx, &dummy_len,
+ VHOST_ACCESS_RO) < 0))
+ break;
+
+ if (likely(dev->dequeue_zero_copy == 0))
+ update_shadow_used_ring_split(vq, head_idx, 0);
+
+ rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
+
+ pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(pkts[i] == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "Failed to allocate memory for mbuf.\n");
+ break;
+ }
+
+ err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
+ mbuf_pool);
+ if (unlikely(err)) {
+ rte_pktmbuf_free(pkts[i]);
+ break;
+ }
+
+ if (unlikely(dev->dequeue_zero_copy)) {
+ struct zcopy_mbuf *zmbuf;
+
+ zmbuf = get_zmbuf(vq);
+ if (!zmbuf) {
+ rte_pktmbuf_free(pkts[i]);
+ break;
+ }
+ zmbuf->mbuf = pkts[i];
+ zmbuf->desc_idx = head_idx;
+
+ /*
+ * Pin lock the mbuf; we will check later to see
+ * whether the mbuf is freed (when we are the last
+ * user) or not. If that's the case, we then could
+ * update the used ring safely.
+ */
+ rte_mbuf_refcnt_update(pkts[i], 1);
+
+ vq->nr_zmbuf += 1;
+ TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+ }
+ }
+ vq->last_avail_idx += i;
+
+ if (likely(dev->dequeue_zero_copy == 0)) {
+ do_data_copy_dequeue(vq);
+ if (unlikely(i < count))
+ vq->shadow_used_idx = i;
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_split(dev, vq);
+ vhost_vring_call_split(dev, vq);
+ }
+ }
+
+ return i;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+ uint16_t i;
+
+ if (unlikely(dev->dequeue_zero_copy)) {
+ struct zcopy_mbuf *zmbuf, *next;
+
+ for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+ zmbuf != NULL; zmbuf = next) {
+ next = TAILQ_NEXT(zmbuf, next);
+
+ if (mbuf_is_consumed(zmbuf->mbuf)) {
+ update_shadow_used_ring_packed(vq,
+ zmbuf->desc_idx,
+ 0,
+ zmbuf->desc_count);
+
+ TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ restore_mbuf(zmbuf->mbuf);
+ rte_pktmbuf_free(zmbuf->mbuf);
+ put_zmbuf(zmbuf);
+ vq->nr_zmbuf -= 1;
+ }
+ }
+
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_packed(dev, vq);
+ vhost_vring_call_packed(dev, vq);
+ }
+ }
+
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+ count = RTE_MIN(count, MAX_PKT_BURST);
+ VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+ dev->vid, count);
+
+ for (i = 0; i < count; i++) {
+ struct buf_vector buf_vec[BUF_VECTOR_MAX];
+ uint16_t buf_id;
+ uint32_t dummy_len;
+ uint16_t desc_count, nr_vec = 0;
+ int err;
+
+ if (unlikely(fill_vec_buf_packed(dev, vq,
+ vq->last_avail_idx, &desc_count,
+ buf_vec, &nr_vec,
+ &buf_id, &dummy_len,
+ VHOST_ACCESS_RO) < 0))
+ break;
+
+ if (likely(dev->dequeue_zero_copy == 0))
+ update_shadow_used_ring_packed(vq, buf_id, 0,
+ desc_count);
+
+ rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
+
+ pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(pkts[i] == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "Failed to allocate memory for mbuf.\n");
+ break;
+ }
+
+ err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
+ mbuf_pool);
+ if (unlikely(err)) {
+ rte_pktmbuf_free(pkts[i]);
+ break;
+ }
+
+ if (unlikely(dev->dequeue_zero_copy)) {
+ struct zcopy_mbuf *zmbuf;
+
+ zmbuf = get_zmbuf(vq);
+ if (!zmbuf) {
+ rte_pktmbuf_free(pkts[i]);
+ break;
+ }
+ zmbuf->mbuf = pkts[i];
+ zmbuf->desc_idx = buf_id;
+ zmbuf->desc_count = desc_count;
+
+ /*
+ * Pin lock the mbuf; we will check later to see
+ * whether the mbuf is freed (when we are the last
+ * user) or not. If that's the case, we then could
+ * update the used ring safely.
+ */
+ rte_mbuf_refcnt_update(pkts[i], 1);
+
+ vq->nr_zmbuf += 1;
+ TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+ }
+
+ vq->last_avail_idx += desc_count;
+ if (vq->last_avail_idx >= vq->size) {
+ vq->last_avail_idx -= vq->size;
+ vq->avail_wrap_counter ^= 1;
+ }
+ }
+
+ if (likely(dev->dequeue_zero_copy == 0)) {
+ do_data_copy_dequeue(vq);
+ if (unlikely(i < count))
+ vq->shadow_used_idx = i;
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_packed(dev, vq);
+ vhost_vring_call_packed(dev, vq);
+ }
+ }
+
+ return i;
+}
+
+uint16_t
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+ struct virtio_net *dev;
+ struct rte_mbuf *rarp_mbuf = NULL;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "(%d) %s: built-in vhost net backend is disabled.\n",
+ dev->vid, __func__);
+ return 0;
+ }
+
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+ RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, queue_id);
+ return 0;
+ }
+
+ vq = dev->virtqueue[queue_id];
+
+ if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+ return 0;
+
+ if (unlikely(vq->enabled == 0)) {
+ count = 0;
+ goto out_access_unlock;
+ }
+
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_rd_lock(vq);
+
+ if (unlikely(vq->access_ok == 0))
+ if (unlikely(vring_translate(dev, vq) < 0)) {
+ count = 0;
+ goto out;
+ }
+
+ /*
+ * Construct a RARP broadcast packet, and inject it to the "pkts"
+ * array, to looks like that guest actually send such packet.
+ *
+ * Check user_send_rarp() for more information.
+ *
+ * broadcast_rarp shares a cacheline in the virtio_net structure
+ * with some fields that are accessed during enqueue and
+ * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
+ * result in false sharing between enqueue and dequeue.
+ *
+ * Prevent unnecessary false sharing by reading broadcast_rarp first
+ * and only performing cmpset if the read indicates it is likely to
+ * be set.
+ */
+ if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
+ rte_atomic16_cmpset((volatile uint16_t *)
+ &dev->broadcast_rarp.cnt, 1, 0))) {
+
+ rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+ if (rarp_mbuf == NULL) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "Failed to make RARP packet.\n");
+ count = 0;
+ goto out;
+ }
+ count -= 1;
+ }
+
+ if (vq_is_packed(dev))
+ count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
+ else
+ count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
+
+out:
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
+ if (unlikely(rarp_mbuf != NULL)) {
+ /*
+ * Inject it to the head of "pkts" array, so that switch's mac
+ * learning table will get updated first.
+ */
+ memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+ pkts[0] = rarp_mbuf;
+ count += 1;
+ }
+
+ return count;
+}