summaryrefslogtreecommitdiffstats
path: root/src/seastar/dpdk/drivers/net/xenvirt
diff options
context:
space:
mode:
Diffstat (limited to 'src/seastar/dpdk/drivers/net/xenvirt')
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/Makefile57
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.c767
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.h61
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt_version.map7
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/rte_mempool_gntalloc.c295
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.c454
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.h116
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/virtio_logs.h70
-rw-r--r--src/seastar/dpdk/drivers/net/xenvirt/virtqueue.h273
9 files changed, 2100 insertions, 0 deletions
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/Makefile b/src/seastar/dpdk/drivers/net/xenvirt/Makefile
new file mode 100644
index 00000000..8b4b8f03
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/Makefile
@@ -0,0 +1,57 @@
+# BSD LICENSE
+#
+# Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_xenvirt.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+LDLIBS += -lxenstore
+
+EXPORT_MAP := rte_eth_xenvirt_version.map
+
+LIBABIVER := 1
+
+#
+# all sources are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += rte_eth_xenvirt.c rte_mempool_gntalloc.c rte_xen_lib.c
+
+#
+# Export include files
+#
+SYMLINK-y-include += rte_eth_xenvirt.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.c b/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.c
new file mode 100644
index 00000000..7bd29fae
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.c
@@ -0,0 +1,767 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <sys/user.h>
+#ifndef PAGE_SIZE
+#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
+#endif
+#include <linux/binfmts.h>
+#include <xen/xen-compat.h>
+#if __XEN_LATEST_INTERFACE_VERSION__ < 0x00040200
+#include <xs.h>
+#else
+#include <xenstore.h>
+#endif
+#include <linux/virtio_ring.h>
+
+#include <rte_mbuf.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_memcpy.h>
+#include <rte_string_fns.h>
+#include <rte_vdev.h>
+#include <cmdline_parse.h>
+#include <cmdline_parse_etheraddr.h>
+
+#include "rte_xen_lib.h"
+#include "virtqueue.h"
+#include "rte_eth_xenvirt.h"
+
+/* Number of descriptors in every virtqueue created by virtio_queue_setup(). */
+#define VQ_DESC_NUM 256
+/* Upper bound on mbufs dequeued from a virtqueue in one RX/TX burst call. */
+#define VIRTIO_MBUF_BURST_SZ 64
+
+/*
+ * Index handed to the next device: incremented in eth_dev_xenvirt_create()
+ * and decremented in eth_dev_xenvirt_free(). A value of 0 is also used by
+ * probe/remove to decide when to open/close xenstore and gntalloc.
+ */
+static int virtio_idx = 0;
+
+/* Initial link state copied into every new device (link starts DOWN). */
+static struct rte_eth_link pmd_link = {
+ .link_speed = ETH_SPEED_NUM_10G,
+ .link_duplex = ETH_LINK_FULL_DUPLEX,
+ .link_status = ETH_LINK_DOWN,
+ .link_autoneg = ETH_LINK_SPEED_FIXED
+};
+
+/* Forward declaration: used by eth_dev_close() before its definition. */
+static void
+eth_xenvirt_free_queues(struct rte_eth_dev *dev);
+
+/*
+ * RX burst handler (installed as eth_dev->rx_pkt_burst).
+ *
+ * q:       RX virtqueue to poll.
+ * rx_pkts: output array that receives the dequeued mbufs.
+ * nb_pkts: capacity of rx_pkts.
+ *
+ * Returns the number of mbufs written to rx_pkts; at most
+ * min(nb_pkts, VIRTIO_MBUF_BURST_SZ) are dequeued per call.
+ */
+static uint16_t
+eth_xenvirt_rx(void *q, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct virtqueue *rxvq = q;
+ struct rte_mbuf *rxm, *new_mbuf;
+ uint16_t nb_used, num;
+ uint32_t len[VIRTIO_MBUF_BURST_SZ];
+ uint32_t i;
+ struct pmd_internals *pi = rxvq->internals;
+
+ nb_used = VIRTQUEUE_NUSED(rxvq);
+
+ /* Read barrier placed after sampling the used count, before the ring is read. */
+ rte_smp_rmb();
+ /* Clamp to the caller's array size and to the local len[] array size. */
+ num = (uint16_t)(likely(nb_used <= nb_pkts) ? nb_used : nb_pkts);
+ num = (uint16_t)(likely(num <= VIRTIO_MBUF_BURST_SZ) ? num : VIRTIO_MBUF_BURST_SZ);
+ if (unlikely(num == 0)) return 0;
+
+ num = virtqueue_dequeue_burst(rxvq, rx_pkts, len, num);
+ PMD_RX_LOG(DEBUG, "used:%d dequeue:%d\n", nb_used, num);
+ for (i = 0; i < num ; i ++) {
+ rxm = rx_pkts[i];
+ PMD_RX_LOG(DEBUG, "packet len:%d\n", len[i]);
+ rxm->next = NULL;
+ rxm->data_off = RTE_PKTMBUF_HEADROOM;
+ /* Reported length includes the virtio_net_hdr; strip it from the mbuf lengths. */
+ rxm->data_len = (uint16_t)(len[i] - sizeof(struct virtio_net_hdr));
+ rxm->nb_segs = 1;
+ rxm->port = pi->port_id;
+ rxm->pkt_len = (uint32_t)(len[i] - sizeof(struct virtio_net_hdr));
+ }
+ /* allocate new mbuf for the used descriptor */
+ while (likely(!virtqueue_full(rxvq))) {
+ new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
+ if (unlikely(new_mbuf == NULL)) {
+ break;
+ }
+ /* Give the mbuf back to the pool if it could not be posted to the ring. */
+ if (unlikely(virtqueue_enqueue_recv_refill(rxvq, new_mbuf))) {
+ rte_pktmbuf_free_seg(new_mbuf);
+ break;
+ }
+ }
+ pi->eth_stats.ipackets += num;
+ return num;
+}
+
+/*
+ * TX burst handler (installed as eth_dev->tx_pkt_burst).
+ *
+ * tx_queue: TX virtqueue.
+ * tx_pkts:  mbufs to transmit.
+ * nb_pkts:  number of entries in tx_pkts.
+ *
+ * Returns the number of mbufs actually enqueued; the remaining
+ * (nb_pkts - return value) mbufs are left untouched for the caller.
+ */
+static uint16_t
+eth_xenvirt_tx(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct virtqueue *txvq = tx_queue;
+ struct rte_mbuf *txm;
+ uint16_t nb_used, nb_tx, num, i;
+ int error;
+ uint32_t len[VIRTIO_MBUF_BURST_SZ];
+ struct rte_mbuf *snd_pkts[VIRTIO_MBUF_BURST_SZ];
+ struct pmd_internals *pi = txvq->internals;
+
+ nb_tx = 0;
+
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
+ nb_used = VIRTQUEUE_NUSED(txvq);
+
+ /* Read barrier placed after sampling the used count, before the ring is read. */
+ rte_smp_rmb();
+
+ /* Reclaim completed descriptors first (bounded by the local array size). */
+ num = (uint16_t)(likely(nb_used <= VIRTIO_MBUF_BURST_SZ) ? nb_used : VIRTIO_MBUF_BURST_SZ);
+ num = virtqueue_dequeue_burst(txvq, snd_pkts, len, num);
+
+ for (i = 0; i < num ; i ++) {
+ /* mergable not supported, one segment only */
+ rte_pktmbuf_free_seg(snd_pkts[i]);
+ }
+
+ /* Enqueue until the ring is full, an enqueue fails, or all packets are queued. */
+ while (nb_tx < nb_pkts) {
+ if (likely(!virtqueue_full(txvq))) {
+ /* TODO drop tx_pkts if it contains multiple segments */
+ txm = tx_pkts[nb_tx];
+ error = virtqueue_enqueue_xmit(txvq, txm);
+ if (unlikely(error)) {
+ if (error == ENOSPC)
+ PMD_TX_LOG(ERR, "virtqueue_enqueue Free count = 0\n");
+ else if (error == EMSGSIZE)
+ PMD_TX_LOG(ERR, "virtqueue_enqueue Free count < 1\n");
+ else
+ PMD_TX_LOG(ERR, "virtqueue_enqueue error: %d\n", error);
+ break;
+ }
+ nb_tx++;
+ } else {
+ PMD_TX_LOG(ERR, "No free tx descriptors to transmit\n");
+ /* virtqueue_notify not needed in our para-virt solution */
+ break;
+ }
+ }
+ pi->eth_stats.opackets += nb_tx;
+ return nb_tx;
+}
+
+/*
+ * dev_configure callback: no configuration is applied; it only logs its
+ * own name and reports success.
+ * NOTE(review): the trace is emitted at ERR level although nothing failed.
+ */
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+ RTE_LOG(ERR, PMD, "%s\n", __func__);
+ return 0;
+}
+
+/*
+ * Create a shared page between guest and host and publish its grant
+ * reference in xenstore under DPDK_XENSTORE_PATH"<vtidx>"VRING_FLAG_STR.
+ * The first byte of the page is set to MAP_FLAG. Host monitors this page
+ * if it is cleared on unmap, and then does the necessary clean up.
+ *
+ * vtidx: index of the virtio device the flag page belongs to.
+ *
+ * The process is terminated with a failure status if the page cannot be
+ * allocated; a failed xenstore write is logged.
+ */
+static void
+gntalloc_vring_flag(int vtidx)
+{
+	char key_str[PATH_MAX];
+	char val_str[PATH_MAX];
+	uint32_t gref_tmp;
+	void *ptr;
+
+	if (grefwatch_from_alloc(&gref_tmp, &ptr)) {
+		RTE_LOG(ERR, PMD, "grefwatch_from_alloc error\n");
+		/* Fatal error: do not report success to the parent process. */
+		exit(EXIT_FAILURE);
+	}
+
+	*(uint8_t *)ptr = MAP_FLAG;
+	snprintf(val_str, sizeof(val_str), "%u", gref_tmp);
+	snprintf(key_str, sizeof(key_str),
+		DPDK_XENSTORE_PATH"%d"VRING_FLAG_STR, vtidx);
+	if (xenstore_write(key_str, val_str) != 0)
+		RTE_LOG(ERR, PMD, "%s: failed to write %s to xenstore\n",
+			__func__, key_str);
+}
+
+/*
+ * Announce to the host that virtio device `vtidx` has started, so the
+ * host may begin polling it. The vring flag page is published first,
+ * then the start event key is written to xenstore with the value "1".
+ */
+static void
+dev_start_notify(int vtidx)
+{
+	char xs_key[PATH_MAX];
+	char xs_val[PATH_MAX];
+
+	RTE_LOG(INFO, PMD, "%s: virtio %d is started\n", __func__, vtidx);
+
+	/* Flag page must exist before the start event is announced. */
+	gntalloc_vring_flag(vtidx);
+
+	snprintf(xs_val, sizeof(xs_val), "1");
+	snprintf(xs_key, sizeof(xs_key), "%s%s%d",
+		DPDK_XENSTORE_PATH, EVENT_TYPE_START_STR, vtidx);
+
+	xenstore_write(xs_key, xs_val);
+}
+
+/*
+ * Notify host this virtio device is stopped.
+ * Host could stop polling this device.
+ *
+ * Currently a placeholder: no notification is sent, the argument is
+ * only marked as used.
+ */
+static void
+dev_stop_notify(int vtidx)
+{
+ RTE_SET_USED(vtidx);
+}
+
+
+/*
+ * Publish the device MAC address in xenstore under
+ * DPDK_XENSTORE_PATH"<vtidx>_ether_addr" as "xx:xx:xx:xx:xx:xx".
+ *
+ * mac_addrs: address to publish (must not be NULL).
+ * vtidx:     index of the virtio device.
+ *
+ * Returns 0 on success, -1 on a NULL address, a formatting error or
+ * truncation, or a failed xenstore write.
+ */
+static int
+update_mac_address(struct ether_addr *mac_addrs, int vtidx)
+{
+	char key_str[PATH_MAX];
+	char val_str[PATH_MAX];
+	int rv;
+
+	if (mac_addrs == NULL) {
+		RTE_LOG(ERR, PMD, "%s: NULL pointer mac specified\n", __func__);
+		return -1;
+	}
+
+	rv = snprintf(key_str, sizeof(key_str),
+		DPDK_XENSTORE_PATH"%d_ether_addr", vtidx);
+	if (rv < 0 || (size_t)rv >= sizeof(key_str))
+		return -1;
+
+	rv = snprintf(val_str, sizeof(val_str), "%02x:%02x:%02x:%02x:%02x:%02x",
+		mac_addrs->addr_bytes[0],
+		mac_addrs->addr_bytes[1],
+		mac_addrs->addr_bytes[2],
+		mac_addrs->addr_bytes[3],
+		mac_addrs->addr_bytes[4],
+		mac_addrs->addr_bytes[5]);
+	if (rv < 0 || (size_t)rv >= sizeof(val_str))
+		return -1;
+
+	/* Report an error code here, not the (positive) snprintf length. */
+	if (xenstore_write(key_str, val_str))
+		return -1;
+
+	return 0;
+}
+
+
+/*
+ * dev_start callback: mark the link up, pre-fill RX queue 0 with mbufs,
+ * attach the private data to both queues, publish the MAC address and
+ * notify the host that the device has started.
+ *
+ * Returns 0 on success, -1 if the MAC address could not be published.
+ */
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+	struct virtqueue *rx_ring = dev->data->rx_queues[0];
+	struct virtqueue *tx_ring = dev->data->tx_queues[0];
+	struct pmd_internals *priv = dev->data->dev_private;
+
+	dev->data->dev_link.link_status = ETH_LINK_UP;
+
+	/* Post receive buffers until the ring is full or allocation fails. */
+	for (;;) {
+		struct rte_mbuf *buf;
+
+		if (virtqueue_full(rx_ring))
+			break;
+
+		buf = rte_mbuf_raw_alloc(rx_ring->mpool);
+		if (buf == NULL)
+			break;
+
+		if (virtqueue_enqueue_recv_refill(rx_ring, buf)) {
+			rte_pktmbuf_free_seg(buf);
+			break;
+		}
+	}
+
+	rx_ring->internals = priv;
+	tx_ring->internals = priv;
+
+	if (update_mac_address(dev->data->mac_addrs, priv->virtio_idx))
+		return -1;
+
+	dev_start_notify(priv->virtio_idx);
+	return 0;
+}
+
+/*
+ * dev_stop callback: mark the link down and run the (currently empty)
+ * stop notification for this device's virtio index.
+ */
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+ struct pmd_internals *pi = (struct pmd_internals *)dev->data->dev_private;
+
+ dev->data->dev_link.link_status = ETH_LINK_DOWN;
+ dev_stop_notify(pi->virtio_idx);
+}
+
+/*
+ * dev_close callback: release all RX/TX queues of the device.
+ * No explicit notification is sent to the host from here.
+ */
+static void
+eth_dev_close(struct rte_eth_dev *dev)
+{
+ eth_xenvirt_free_queues(dev);
+}
+
+/*
+ * dev_infos_get callback: report the fixed capabilities of this PMD
+ * (a single MAC address, one RX and one TX queue, 2048-byte RX frames).
+ * Nothing is read from the device itself.
+ */
+static void
+eth_dev_info(struct rte_eth_dev *dev,
+		struct rte_eth_dev_info *dev_info)
+{
+	RTE_SET_USED(dev);
+
+	dev_info->min_rx_bufsize = 0;
+	dev_info->max_rx_pktlen = (uint32_t)2048;
+	dev_info->max_rx_queues = (uint16_t)1;
+	dev_info->max_tx_queues = (uint16_t)1;
+	dev_info->max_mac_addrs = 1;
+}
+
+/*
+ * stats_get callback: copy the software counters kept in the private
+ * data into `stats`. A NULL `stats` pointer is ignored.
+ */
+static void
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+
+	if (stats == NULL)
+		return;
+
+	rte_memcpy(stats, &priv->eth_stats, sizeof(*stats));
+}
+
+/* stats_reset callback: zero the software counters kept in the private data. */
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+ /* Reset software totals */
+ memset(&internals->eth_stats, 0, sizeof(internals->eth_stats));
+}
+
+/*
+ * rx/tx_queue_release callback: free the virtqueue structure.
+ * NOTE(review): only the rte_zmalloc'ed virtqueue is freed here; the vring
+ * memory obtained in gntalloc_vring_create() does not appear to be released
+ * on this path - confirm whether that is intentional.
+ */
+static void
+eth_queue_release(void *q)
+{
+ rte_free(q);
+}
+
+/* link_update callback: nothing to query; always returns 0. */
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+ int wait_to_complete __rte_unused)
+{
+ return 0;
+}
+
+/*
+ * Create a vring shared between guest and host.
+ * Memory is allocated through the grant alloc driver, so it is not
+ * physically contiguous. The grant reference and physical address of each
+ * page are written to xenstore under the RX or TX vring key of the device.
+ *
+ * queue_type: VTNET_RQ selects the RX key; any other value the TX key.
+ * size:       requested size in bytes (rounded up to a page multiple).
+ * vtidx:      index of the virtio device.
+ *
+ * Returns the virtual address of the vring, or NULL on any failure; on
+ * failure the grant allocation (if made) is released again.
+ */
+static void *
+gntalloc_vring_create(int queue_type, uint32_t size, int vtidx)
+{
+	char key_str[PATH_MAX] = {0};
+	char val_str[PATH_MAX] = {0};
+	void *va = NULL;
+	int pg_size;
+	uint32_t pg_num;
+	uint32_t *gref_arr = NULL;
+	phys_addr_t *pa_arr = NULL;
+	uint64_t start_index = 0;
+	int published = 0;
+	int rv;
+
+	pg_size = getpagesize();
+	size = RTE_ALIGN_CEIL(size, pg_size);
+	pg_num = size / pg_size;
+
+	gref_arr = calloc(pg_num, sizeof(gref_arr[0]));
+	pa_arr = calloc(pg_num, sizeof(pa_arr[0]));
+
+	if (gref_arr == NULL || pa_arr == NULL) {
+		RTE_LOG(ERR, PMD, "%s: calloc failed\n", __func__);
+		goto out;
+	}
+
+	va = gntalloc(size, gref_arr, &start_index);
+	if (va == NULL) {
+		RTE_LOG(ERR, PMD, "%s: gntalloc failed\n", __func__);
+		goto out;
+	}
+
+	if (get_phys_map(va, pa_arr, pg_num, pg_size)) {
+		RTE_LOG(ERR, PMD, "%s: get_phys_map failed\n", __func__);
+		goto out;
+	}
+
+	/* write in xenstore gref and pfn for each page of vring */
+	if (grant_node_create(pg_num, gref_arr, pa_arr, val_str, sizeof(val_str)))
+		goto out;
+
+	if (queue_type == VTNET_RQ)
+		rv = snprintf(key_str, sizeof(key_str),
+			DPDK_XENSTORE_PATH"%d"RXVRING_XENSTORE_STR, vtidx);
+	else
+		rv = snprintf(key_str, sizeof(key_str),
+			DPDK_XENSTORE_PATH"%d"TXVRING_XENSTORE_STR, vtidx);
+	if (rv < 0 || (size_t)rv >= sizeof(key_str))
+		goto out;
+
+	if (xenstore_write(key_str, val_str) == -1)
+		goto out;
+
+	published = 1;
+out:
+	/* Every failure after a successful gntalloc must give the pages back. */
+	if (!published && va != NULL) {
+		gntfree(va, size, start_index);
+		va = NULL;
+	}
+	free(pa_arr);
+	free(gref_arr);
+
+	return va;
+}
+
+
+
+/*
+ * Allocate and initialise one virtqueue (VQ_DESC_NUM descriptors) for
+ * the device, including its vring shared with the host.
+ *
+ * dev:        ethdev the queue belongs to.
+ * queue_type: VTNET_RQ for a receive queue, VTNET_TQ for a transmit queue.
+ *
+ * Returns the new virtqueue, or NULL if the queue type is unknown or any
+ * allocation fails (nothing is leaked in that case).
+ */
+static struct virtqueue *
+virtio_queue_setup(struct rte_eth_dev *dev, int queue_type)
+{
+	struct virtqueue *vq;
+	struct vring *vr;
+	const char *suffix;
+	uint16_t vq_size = VQ_DESC_NUM;
+	char vq_name[VIRTQUEUE_MAX_NAME_SZ];
+	size_t size;
+	int i;
+
+	if (queue_type == VTNET_RQ) {
+		suffix = "rvq";
+	} else if (queue_type == VTNET_TQ) {
+		suffix = "tvq";
+	} else {
+		RTE_LOG(ERR, PMD, "%s: unknown queue type %d\n",
+			__func__, queue_type);
+		return NULL;
+	}
+
+	/* Allocate memory for virtqueue. */
+	snprintf(vq_name, sizeof(vq_name), "port%d_%s",
+		dev->data->port_id, suffix);
+	vq = rte_zmalloc(vq_name, sizeof(struct virtqueue) +
+		vq_size * sizeof(struct vq_desc_extra), RTE_CACHE_LINE_SIZE);
+	if (vq == NULL) {
+		RTE_LOG(ERR, PMD, "%s: unabled to allocate virtqueue\n", __func__);
+		return NULL;
+	}
+	snprintf(vq->vq_name, sizeof(vq->vq_name), "%s", vq_name);
+
+	vq->vq_alignment = VIRTIO_PCI_VRING_ALIGN;
+	vq->vq_nentries = vq_size;
+	vq->vq_free_cnt = vq_size;
+	/* Calculate vring size according to virtio spec */
+	size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN);
+	vq->vq_ring_size = RTE_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN);
+
+	/* Allocate memory for virtio vring through gntalloc driver */
+	vq->vq_ring_virt_mem = gntalloc_vring_create(queue_type, vq->vq_ring_size,
+		((struct pmd_internals *)dev->data->dev_private)->virtio_idx);
+	if (vq->vq_ring_virt_mem == NULL) {
+		RTE_LOG(ERR, PMD, "%s: unable to create vring\n", __func__);
+		rte_free(vq);
+		return NULL;
+	}
+	memset(vq->vq_ring_virt_mem, 0, vq->vq_ring_size);
+	vr = &vq->vq_ring;
+	vring_init(vr, vq_size, vq->vq_ring_virt_mem, vq->vq_alignment);
+
+	/*
+	 * Locally maintained last consumed index, this index trails
+	 * vq_ring.used->idx.
+	 */
+	vq->vq_used_cons_idx = 0;
+	vq->vq_desc_head_idx = 0;
+	vq->vq_free_cnt = vq->vq_nentries;
+	memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries);
+
+	/* Chain all the descriptors in the ring with an END */
+	for (i = 0; i < vq_size - 1; i++)
+		vr->desc[i].next = (uint16_t)(i + 1);
+	vr->desc[i].next = VQ_RING_DESC_CHAIN_END;
+
+	return vq;
+}
+
+/*
+ * rx_queue_setup callback: create the RX virtqueue for `rx_queue_id` and
+ * bind it to the mbuf pool used for receive buffers.
+ *
+ * Returns 0 on success, -ENOMEM if the virtqueue could not be created
+ * (the queue slot is then left NULL).
+ */
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev,uint16_t rx_queue_id,
+		uint16_t nb_rx_desc __rte_unused,
+		unsigned int socket_id __rte_unused,
+		const struct rte_eth_rxconf *rx_conf __rte_unused,
+		struct rte_mempool *mb_pool)
+{
+	struct virtqueue *vq;
+
+	vq = virtio_queue_setup(dev, VTNET_RQ);
+	dev->data->rx_queues[rx_queue_id] = vq;
+	if (vq == NULL)
+		return -ENOMEM;
+
+	vq->mpool = mb_pool;
+	return 0;
+}
+
+/*
+ * tx_queue_setup callback: create the TX virtqueue for `tx_queue_id`.
+ *
+ * Returns 0 on success, -ENOMEM if the virtqueue could not be created
+ * (the queue slot is then left NULL).
+ */
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+		uint16_t nb_tx_desc __rte_unused,
+		unsigned int socket_id __rte_unused,
+		const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+	struct virtqueue *vq;
+
+	vq = virtio_queue_setup(dev, VTNET_TQ);
+	dev->data->tx_queues[tx_queue_id] = vq;
+	if (vq == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Release every RX and TX queue of the device, clear the queue slots
+ * and reset both queue counts to zero.
+ */
+static void
+eth_xenvirt_free_queues(struct rte_eth_dev *dev)
+{
+	struct rte_eth_dev_data *data = dev->data;
+	int q;
+
+	q = 0;
+	while (q < data->nb_rx_queues) {
+		eth_queue_release(data->rx_queues[q]);
+		data->rx_queues[q] = NULL;
+		q++;
+	}
+	data->nb_rx_queues = 0;
+
+	q = 0;
+	while (q < data->nb_tx_queues) {
+		eth_queue_release(data->tx_queues[q]);
+		data->tx_queues[q] = NULL;
+		q++;
+	}
+	data->nb_tx_queues = 0;
+}
+
+/*
+ * ethdev callback table installed on every xenvirt device in
+ * eth_dev_xenvirt_create(). RX and TX queues share one release routine.
+ */
+static const struct eth_dev_ops ops = {
+ .dev_start = eth_dev_start,
+ .dev_stop = eth_dev_stop,
+ .dev_close = eth_dev_close,
+ .dev_configure = eth_dev_configure,
+ .dev_infos_get = eth_dev_info,
+ .rx_queue_setup = eth_rx_queue_setup,
+ .tx_queue_setup = eth_tx_queue_setup,
+ .rx_queue_release = eth_queue_release,
+ .tx_queue_release = eth_queue_release,
+ .link_update = eth_link_update,
+ .stats_get = eth_stats_get,
+ .stats_reset = eth_stats_reset,
+};
+
+
+/*
+ * Parse the vdev argument string into `dict`.
+ *
+ * The string is split into pairs on RTE_ETH_XENVIRT_PAIRS_DELIM and each
+ * pair into key and value on RTE_ETH_XENVIRT_KEY_VALUE_DELIM. Only the
+ * RTE_ETH_XENVIRT_MAC_PARAM key is interpreted; other well-formed pairs
+ * are skipped.
+ *
+ * dict:   receives the parsed MAC address and its validity flag.
+ * name:   device name, used in log messages only.
+ * params: argument string; NULL means "no arguments" and succeeds.
+ *
+ * Returns 0 on success, -1 on allocation failure, a malformed pair or
+ * an invalid MAC address.
+ */
+static int
+rte_eth_xenvirt_parse_args(struct xenvirt_dict *dict,
+		const char *name, const char *params)
+{
+	char *tokens[RTE_ETH_XENVIRT_MAX_ARGS];
+	char *kv[2];
+	char *buf;
+	size_t params_len;
+	int n_tokens;
+	int idx;
+	int status = -1;
+
+	if (params == NULL)
+		return 0;
+
+	/* Work on a zero-filled private copy: rte_strsplit modifies its input. */
+	params_len = strlen(params);
+	buf = rte_zmalloc(NULL, params_len + 1, RTE_CACHE_LINE_SIZE);
+	if (buf == NULL) {
+		RTE_LOG(ERR, PMD, "Couldn't parse %s device \n", name);
+		return -1;
+	}
+	rte_memcpy(buf, params, params_len);
+
+	n_tokens = rte_strsplit(buf, strnlen(buf, MAX_ARG_STRLEN),
+			tokens, RTE_ETH_XENVIRT_MAX_ARGS,
+			RTE_ETH_XENVIRT_PAIRS_DELIM);
+
+	for (idx = 0; idx < n_tokens; idx++) {
+		kv[0] = NULL;
+		kv[1] = NULL;
+		rte_strsplit(tokens[idx], strnlen(tokens[idx], MAX_ARG_STRLEN),
+				kv, 2, RTE_ETH_XENVIRT_KEY_VALUE_DELIM);
+
+		if (kv[0] == NULL || kv[1] == NULL ||
+		    kv[0][0] == 0 || kv[1][0] == 0) {
+			RTE_LOG(ERR, PMD,
+				"Couldn't parse %s device,"
+				"wrong key or value \n", name);
+			goto done;
+		}
+
+		/* Keys other than the MAC parameter are ignored. */
+		if (strncmp(kv[0], RTE_ETH_XENVIRT_MAC_PARAM,
+			    sizeof(RTE_ETH_XENVIRT_MAC_PARAM)) != 0)
+			continue;
+
+		if (cmdline_parse_etheraddr(NULL, kv[1], &dict->addr,
+					    sizeof(dict->addr)) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Invalid %s device ether address\n",
+				name);
+			goto done;
+		}
+		dict->addr_valid = 1;
+	}
+
+	status = 0;
+done:
+	rte_free(buf);
+	return status;
+}
+
+/*
+ * Requested action for eth_dev_xenvirt_create(). The value is currently
+ * accepted but not acted upon there (it is only marked as used).
+ */
+enum dev_action {
+ DEV_CREATE,
+ DEV_ATTACH
+};
+
+/* Tentative definition; the initialised definition is at the end of the file. */
+static struct rte_vdev_driver pmd_xenvirt_drv;
+
+/*
+ * Create one xenvirt ethdev.
+ *
+ * name:      device name, also used as the ethdev name.
+ * params:    vdev argument string (may be NULL), see
+ *            rte_eth_xenvirt_parse_args().
+ * numa_node: socket on which the device data is allocated.
+ * action:    currently unused.
+ *
+ * The MAC address comes from `params` when given, otherwise a random
+ * address is generated. On success the global virtio_idx is consumed
+ * and incremented.
+ *
+ * Returns 0 on success, -1 on a parse or allocation failure; every
+ * buffer allocated here is freed again on failure.
+ */
+static int
+eth_dev_xenvirt_create(const char *name, const char *params,
+		const unsigned numa_node,
+		enum dev_action action)
+{
+	struct rte_eth_dev_data *data = NULL;
+	struct pmd_internals *internals = NULL;
+	struct ether_addr *mac_addrs = NULL;
+	struct rte_eth_dev *eth_dev = NULL;
+	struct xenvirt_dict dict;
+
+	memset(&dict, 0, sizeof(struct xenvirt_dict));
+
+	RTE_LOG(INFO, PMD, "Creating virtio rings backed ethdev on numa socket %u\n",
+			numa_node);
+	RTE_SET_USED(action);
+
+	if (rte_eth_xenvirt_parse_args(&dict, name, params) < 0) {
+		RTE_LOG(ERR, PMD, "%s: Failed to parse ethdev parameters\n", __func__);
+		return -1;
+	}
+
+	/* now do all data allocation - for eth_dev structure, dummy pci driver
+	 * and internal (private) data
+	 */
+	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+	if (data == NULL)
+		goto err;
+
+	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+	if (internals == NULL)
+		goto err;
+
+	/*
+	 * Allocate the MAC storage before reserving the ethdev entry so that
+	 * every failure path only has plain buffers to free.
+	 */
+	mac_addrs = rte_zmalloc("xen_virtio", ETHER_ADDR_LEN, 0);
+	if (mac_addrs == NULL)
+		goto err;
+
+	/* reserve an ethdev entry */
+	eth_dev = rte_eth_dev_allocate(name);
+	if (eth_dev == NULL)
+		goto err;
+
+	data->dev_private = internals;
+	data->port_id = eth_dev->data->port_id;
+	data->nb_rx_queues = (uint16_t)1;
+	data->nb_tx_queues = (uint16_t)1;
+	data->dev_link = pmd_link;
+	data->mac_addrs = mac_addrs;
+
+	if (dict.addr_valid)
+		memcpy(&data->mac_addrs->addr_bytes, &dict.addr, sizeof(struct ether_addr));
+	else
+		eth_random_addr(&data->mac_addrs->addr_bytes[0]);
+
+	eth_dev->data = data;
+	eth_dev->dev_ops = &ops;
+
+	eth_dev->data->dev_flags = RTE_ETH_DEV_DETACHABLE;
+	eth_dev->data->kdrv = RTE_KDRV_NONE;
+	eth_dev->data->drv_name = pmd_xenvirt_drv.driver.name;
+	eth_dev->data->numa_node = numa_node;
+
+	eth_dev->rx_pkt_burst = eth_xenvirt_rx;
+	eth_dev->tx_pkt_burst = eth_xenvirt_tx;
+
+	internals->virtio_idx = virtio_idx++;
+	internals->port_id = eth_dev->data->port_id;
+
+	return 0;
+
+err:
+	/* rte_free(NULL) is a no-op, so partially completed setups are fine. */
+	rte_free(mac_addrs);
+	rte_free(data);
+	rte_free(internals);
+
+	return -1;
+}
+
+
+/*
+ * Tear down the xenvirt ethdev called `name`.
+ *
+ * A started device is stopped and closed first. The burst handlers and
+ * ops table are cleared, the private data, MAC storage and device data
+ * are freed, and the global virtio_idx is decremented.
+ *
+ * name:      device name to look up.
+ * numa_node: used for logging only.
+ *
+ * Returns 0 on success, -1 if no such device is allocated.
+ */
+static int
+eth_dev_xenvirt_free(const char *name, const unsigned numa_node)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+	struct rte_eth_dev_data *data;
+
+	RTE_LOG(DEBUG, PMD,
+		"Free virtio rings backed ethdev on numa socket %u\n",
+		numa_node);
+
+	/* find an ethdev entry */
+	eth_dev = rte_eth_dev_allocated(name);
+	if (eth_dev == NULL)
+		return -1;
+
+	if (eth_dev->data->dev_started == 1) {
+		eth_dev_stop(eth_dev);
+		eth_dev_close(eth_dev);
+	}
+
+	eth_dev->rx_pkt_burst = NULL;
+	eth_dev->tx_pkt_burst = NULL;
+	eth_dev->dev_ops = NULL;
+
+	/*
+	 * Free the members before the container: reading data->dev_private
+	 * or data->mac_addrs after rte_free(data) is a use-after-free.
+	 * NOTE(review): eth_dev->data still points at the freed block after
+	 * this call, as before - confirm that nothing dereferences it later.
+	 */
+	data = eth_dev->data;
+	rte_free(data->dev_private);
+	rte_free(data->mac_addrs);
+	rte_free(data);
+
+	virtio_idx--;
+
+	return 0;
+}
+
+/* TODO: Support multiple process model */
+/*
+ * vdev probe callback: open xenstore and the grant allocator when no
+ * device exists yet (virtio_idx == 0), then create the ethdev.
+ *
+ * Returns 0 on success, -1 if initialisation or device creation fails.
+ * If the first device cannot be created, the resources opened here are
+ * closed again.
+ */
+static int
+rte_pmd_xenvirt_probe(struct rte_vdev_device *dev)
+{
+	int ret;
+
+	if (virtio_idx == 0) {
+		if (xenstore_init() != 0) {
+			RTE_LOG(ERR, PMD, "%s: xenstore init failed\n", __func__);
+			return -1;
+		}
+		if (gntalloc_open() != 0) {
+			RTE_LOG(ERR, PMD, "%s: grant init failed\n", __func__);
+			if (xenstore_uninit() != 0)
+				RTE_LOG(ERR, PMD, "%s: xenstore uninit failed\n", __func__);
+			return -1;
+		}
+	}
+
+	ret = eth_dev_xenvirt_create(rte_vdev_device_name(dev),
+			rte_vdev_device_args(dev), rte_socket_id(), DEV_CREATE);
+
+	/*
+	 * virtio_idx only changes on a successful create, so a value of 0
+	 * after a failure means the resources were opened by this call.
+	 */
+	if (ret != 0 && virtio_idx == 0) {
+		if (xenstore_uninit() != 0)
+			RTE_LOG(ERR, PMD, "%s: xenstore uninit failed\n", __func__);
+		gntalloc_close();
+	}
+
+	return ret;
+}
+
+/*
+ * vdev remove callback: free the named ethdev and, once no device is
+ * left (virtio_idx == 0), close xenstore and the grant allocator.
+ * Always returns 0.
+ */
+static int
+rte_pmd_xenvirt_remove(struct rte_vdev_device *dev)
+{
+	eth_dev_xenvirt_free(rte_vdev_device_name(dev), rte_socket_id());
+
+	/* Shared resources stay open while other devices still exist. */
+	if (virtio_idx != 0)
+		return 0;
+
+	if (xenstore_uninit() != 0)
+		RTE_LOG(ERR, PMD, "%s: xenstore uninit failed\n", __func__);
+	gntalloc_close();
+
+	return 0;
+}
+
+/* Virtual-device driver descriptor: probe/remove entry points of this PMD. */
+static struct rte_vdev_driver pmd_xenvirt_drv = {
+ .probe = rte_pmd_xenvirt_probe,
+ .remove = rte_pmd_xenvirt_remove,
+};
+
+/* Register the driver as "net_xenvirt" with the alias "eth_xenvirt". */
+RTE_PMD_REGISTER_VDEV(net_xenvirt, pmd_xenvirt_drv);
+RTE_PMD_REGISTER_ALIAS(net_xenvirt, eth_xenvirt);
+/* Accepted device argument: an optional MAC address. */
+RTE_PMD_REGISTER_PARAM_STRING(net_xenvirt,
+ "mac=<mac addr>");
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.h b/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.h
new file mode 100644
index 00000000..598adc6f
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt.h
@@ -0,0 +1,61 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ETH_XENVIRT_H_
+#define _RTE_ETH_XENVIRT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_mempool.h>
+
+/**
+ * Creates mempool for xen virtio PMD.
+ * This function uses memzone_reserve to allocate memory for meta data,
+ * and uses grant alloc driver to allocate memory for data area.
+ * The input parameters are exactly the same as rte_mempool_create.
+ *
+ * @return
+ *   Pointer to the new mempool on success, NULL on failure.
+ */
+struct rte_mempool *
+rte_mempool_gntalloc_create(const char *name, unsigned elt_num, unsigned elt_size,
+ unsigned cache_size, unsigned private_data_size,
+ rte_mempool_ctor_t *mp_init, void *mp_init_arg,
+ rte_mempool_obj_cb_t *obj_init, void *obj_init_arg,
+ int socket_id, unsigned flags);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt_version.map b/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt_version.map
new file mode 100644
index 00000000..dd636f72
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/rte_eth_xenvirt_version.map
@@ -0,0 +1,7 @@
+DPDK_2.0 {
+ global:
+
+ rte_mempool_gntalloc_create;
+
+ local: *;
+};
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/rte_mempool_gntalloc.c b/src/seastar/dpdk/drivers/net/xenvirt/rte_mempool_gntalloc.c
new file mode 100644
index 00000000..73e82f80
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/rte_mempool_gntalloc.c
@@ -0,0 +1,295 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <xen/sys/gntalloc.h>
+
+#include <rte_common.h>
+#include <rte_mempool.h>
+#include <rte_memory.h>
+#include <rte_errno.h>
+
+#include "rte_xen_lib.h"
+#include "rte_eth_xenvirt.h"
+
+/* Per-page bookkeeping for one page obtained from the grant allocator. */
+struct _gntarr {
+ uint32_t gref; /* grant reference of the page */
+ phys_addr_t pa; /* physical address of the page (sort key) */
+ uint64_t index; /* gntalloc index: mmap offset and dealloc handle */
+ void *va; /* virtual address the page is currently mapped at */
+};
+
+/* Result of _create_mempool(), handed to rte_mempool_gntalloc_create(). */
+struct _mempool_gntalloc_info {
+ struct rte_mempool *mp; /* created mempool, NULL on failure */
+ uint32_t pg_num; /* number of pages backing the mempool */
+ uint32_t *gref_arr; /* grant reference per page; freed by the caller */
+ phys_addr_t *pa_arr; /* physical address per page; freed by the caller */
+ void *va; /* base virtual address of the mapped pages */
+ uint32_t mempool_idx; /* index assigned to this mempool */
+ uint64_t start_index; /* first gntalloc index, used for dealloc */
+};
+
+
+/* Last mempool index handed out; starts at -1 so the first pool gets 0. */
+static rte_atomic32_t global_xenvirt_mempool_idx = RTE_ATOMIC32_INIT(-1);
+
+/*
+ * qsort() comparator ordering struct _gntarr entries by ascending
+ * physical address.
+ *
+ * The addresses are compared rather than subtracted: the difference of
+ * two 64-bit physical addresses does not fit in an int, so truncating it
+ * could yield the wrong sign.
+ */
+static int
+compare(const void *p1, const void *p2)
+{
+	const phys_addr_t a = ((const struct _gntarr *)p1)->pa;
+	const phys_addr_t b = ((const struct _gntarr *)p2)->pa;
+
+	return (a > b) - (a < b);
+}
+
+
+/*
+ * Allocate grant-backed pages, remap them in ascending physical-address
+ * order at a fresh virtual range and build a mempool on top of them.
+ *
+ * The parameters are those of rte_mempool_create().
+ *
+ * Returns a descriptor whose `mp` member is the new mempool, or NULL on
+ * failure. On success the caller owns gref_arr and pa_arr and must free
+ * them; on failure all mappings, arrays and grant references obtained
+ * here are released and the other descriptor members are zero.
+ */
+static struct _mempool_gntalloc_info
+_create_mempool(const char *name, unsigned elt_num, unsigned elt_size,
+		unsigned cache_size, unsigned private_data_size,
+		rte_mempool_ctor_t *mp_init, void *mp_init_arg,
+		rte_mempool_obj_cb_t *obj_init, void *obj_init_arg,
+		int socket_id, unsigned flags)
+{
+	struct _mempool_gntalloc_info mgi;
+	struct rte_mempool *mp = NULL;
+	struct rte_mempool_objsz objsz;
+	uint32_t pg_num, rpg_num, pg_shift, pg_sz;
+	char *va, *orig_va, *uv; /* uv: from which, the pages could be freed */
+	ssize_t sz, usz; /* usz: unused size */
+	/*
+	 * for each page allocated through xen_gntalloc driver,
+	 * gref_arr: stores grant references,
+	 * pa_arr: stores physical address,
+	 * gnt_arr: stores all meta data
+	 */
+	uint32_t *gref_arr = NULL;
+	phys_addr_t *pa_arr = NULL;
+	struct _gntarr *gnt_arr = NULL;
+	/* start index of the grant references, used for dealloc */
+	uint64_t start_index = 0;
+	uint32_t i, j;
+	int rv = 0;
+	struct ioctl_gntalloc_dealloc_gref arg;
+
+	/* Never hand uninitialised members back to the caller. */
+	memset(&mgi, 0, sizeof(mgi));
+	va = orig_va = uv = NULL;
+	pg_num = rpg_num = 0;
+	sz = 0;
+
+	pg_sz = getpagesize();
+	if (rte_is_power_of_2(pg_sz) == 0) {
+		goto out;
+	}
+	pg_shift = rte_bsf32(pg_sz);
+
+	rte_mempool_calc_obj_size(elt_size, flags, &objsz);
+	sz = rte_mempool_xmem_size(elt_num, objsz.total_size, pg_shift);
+	pg_num = sz >> pg_shift;
+
+	pa_arr = calloc(pg_num, sizeof(pa_arr[0]));
+	gref_arr = calloc(pg_num, sizeof(gref_arr[0]));
+	gnt_arr = calloc(pg_num, sizeof(gnt_arr[0]));
+	if ((gnt_arr == NULL) || (gref_arr == NULL) || (pa_arr == NULL))
+		goto out;
+
+	/* grant index is continuous in ascending order */
+	orig_va = gntalloc(sz, gref_arr, &start_index);
+	if (orig_va == NULL)
+		goto out;
+
+	/* Without the physical addresses the pages cannot be sorted. */
+	if (get_phys_map(orig_va, pa_arr, pg_num, pg_sz))
+		goto out;
+
+	for (i = 0; i < pg_num; i++) {
+		gnt_arr[i].index = start_index + i * pg_sz;
+		gnt_arr[i].gref = gref_arr[i];
+		gnt_arr[i].pa = pa_arr[i];
+		gnt_arr[i].va = RTE_PTR_ADD(orig_va, i * pg_sz);
+	}
+	qsort(gnt_arr, pg_num, sizeof(struct _gntarr), compare);
+
+	va = get_xen_virtual(sz, pg_sz);
+	if (va == NULL) {
+		goto out;
+	}
+
+	/*
+	 * map one by one, as index isn't continuous now.
+	 * pg_num VMAs, doesn't linux has a limitation on this?
+	 */
+	for (i = 0; i < pg_num; i++) {
+		/* update gref_arr and pa_arr after sort */
+		gref_arr[i] = gnt_arr[i].gref;
+		pa_arr[i] = gnt_arr[i].pa;
+		gnt_arr[i].va = mmap(va + i * pg_sz, pg_sz, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_FIXED, gntalloc_fd, gnt_arr[i].index);
+		if ((gnt_arr[i].va == MAP_FAILED) || (gnt_arr[i].va != (va + i * pg_sz))) {
+			RTE_LOG(ERR, PMD, "failed to map %d pages\n", i);
+			goto mmap_failed;
+		}
+	}
+
+	/*
+	 * Check that allocated size is big enough to hold elt_num
+	 * objects and calculate how many bytes are actually required.
+	 */
+	usz = rte_mempool_xmem_usage(va, elt_num, objsz.total_size, pa_arr, pg_num, pg_shift);
+	if (usz < 0) {
+		mp = NULL;
+		i = pg_num;
+		goto mmap_failed;
+	}
+
+	/* unmap unused pages if any */
+	uv = RTE_PTR_ADD(va, usz);
+	if ((usz = va + sz - uv) > 0) {
+
+		RTE_LOG(ERR, PMD,
+			"%s(%s): unmap unused %zu of %zu "
+			"mmaped bytes @%p orig:%p\n",
+			__func__, name, usz, sz, uv, va);
+		munmap(uv, usz);
+		i = (sz - usz) / pg_sz;
+		for (; i < pg_num; i++) {
+			arg.count = 1;
+			arg.index = gnt_arr[i].index;
+			rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg);
+			if (rv) {
+				/* shouldn't fail here */
+				RTE_LOG(ERR, PMD, "va=%p pa=%"PRIu64"x index=%"PRIu64" %s\n",
+					gnt_arr[i].va,
+					gnt_arr[i].pa,
+					arg.index, strerror(errno));
+				rte_panic("gntdealloc failed when freeing pages\n");
+			}
+		}
+
+		rpg_num = (sz - usz) >> pg_shift;
+	} else
+		rpg_num = pg_num;
+
+	mp = rte_mempool_xmem_create(name, elt_num, elt_size,
+		cache_size, private_data_size,
+		mp_init, mp_init_arg,
+		obj_init, obj_init_arg,
+		socket_id, flags, va, pa_arr, rpg_num, pg_shift);
+
+	if (mp == NULL) {
+		i = pg_num;
+		goto mmap_failed;
+	}
+
+	RTE_ASSERT(elt_num == mp->size);
+
+	mgi.mp = mp;
+	mgi.pg_num = rpg_num;
+	mgi.gref_arr = gref_arr;
+	mgi.pa_arr = pa_arr;
+	mgi.mempool_idx = rte_atomic32_add_return(&global_xenvirt_mempool_idx, 1);
+	mgi.start_index = start_index;
+	mgi.va = va;
+
+	/* Success: the per-page mappings now back the mempool, keep them. */
+	goto out;
+
+/*
+ * unmap only, without deallocate grant reference.
+ * unused pages have already been unmapped,
+ * unmap twice will fail, but it is safe.
+ */
+mmap_failed:
+	for (j = 0; j < i; j++) {
+		if (gnt_arr[j].va != NULL && gnt_arr[j].va != MAP_FAILED)
+			munmap(gnt_arr[j].va, pg_sz);
+	}
+out:
+	free(gnt_arr);
+	if (orig_va)
+		munmap(orig_va, sz);
+	if (mp == NULL) {
+		free(gref_arr);
+		free(pa_arr);
+
+		/* some gref has already been de-allocated from the list in the driver,
+		 * so dealloc one by one, and it is safe to deallocate twice
+		 */
+		if (orig_va) {
+			for (i = 0; i < pg_num; i++) {
+				arg.count = 1;
+				arg.index = start_index + i * pg_sz;
+				/* the ioctl takes a pointer to the request */
+				rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg);
+			}
+		}
+	}
+	return mgi;
+}
+
+/*
+ * Public entry point: create a grant-backed mempool and publish its
+ * pages to the host through grant_gntalloc_mbuf_pool().
+ *
+ * The parameters are those of rte_mempool_create().
+ *
+ * Returns the new mempool, or NULL if the pool could not be created or
+ * published. When publishing fails, the remaining pages are unmapped and
+ * their grant references are deallocated.
+ */
+struct rte_mempool *
+rte_mempool_gntalloc_create(const char *name, unsigned elt_num, unsigned elt_size,
+		unsigned cache_size, unsigned private_data_size,
+		rte_mempool_ctor_t *mp_init, void *mp_init_arg,
+		rte_mempool_obj_cb_t *obj_init, void *obj_init_arg,
+		int socket_id, unsigned flags)
+{
+	int rv;
+	uint32_t i;
+	struct _mempool_gntalloc_info mgi;
+	struct ioctl_gntalloc_dealloc_gref arg;
+	int pg_sz = getpagesize();
+
+	mgi = _create_mempool(name, elt_num, elt_size,
+		cache_size, private_data_size,
+		mp_init, mp_init_arg,
+		obj_init, obj_init_arg,
+		socket_id, flags);
+	if (mgi.mp == NULL)
+		return NULL;
+
+	rv = grant_gntalloc_mbuf_pool(mgi.mp,
+		mgi.pg_num,
+		mgi.gref_arr,
+		mgi.pa_arr,
+		mgi.mempool_idx);
+	free(mgi.gref_arr);
+	free(mgi.pa_arr);
+	if (rv == 0)
+		return mgi.mp;
+
+	/*
+	 * in _create_mempool, unused pages have already been unmapped and
+	 * deallocated; unmap and dealloc the remaining ones here.
+	 */
+	munmap(mgi.va, pg_sz * mgi.pg_num);
+	for (i = 0; i < mgi.pg_num; i++) {
+		arg.count = 1;
+		arg.index = mgi.start_index + i * pg_sz;
+		/* the ioctl takes a pointer to the request */
+		rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg);
+	}
+	return NULL;
+}
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.c b/src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.c
new file mode 100644
index 00000000..6c9a1d49
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.c
@@ -0,0 +1,454 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <xen/xen-compat.h>
+#if __XEN_LATEST_INTERFACE_VERSION__ < 0x00040200
+#include <xs.h>
+#else
+#include <xenstore.h>
+#endif
+#include <xen/sys/gntalloc.h>
+
+#include <rte_common.h>
+#include <rte_string_fns.h>
+#include <rte_malloc.h>
+
+#include "rte_xen_lib.h"
+
+/*
+ * The grant node format in xenstore for vring/mpool is:
+ * 0_rx_vring_gref = "gref1#, gref2#, gref3#"
+ * 0_mempool_gref = "gref1#, gref2#, gref3#"
+ * each gref# is a grant reference for a shared page.
+ * In each shared page, we store the grant_node_item items.
+ *
+ * One grant_node_item describes a single granted page: its grant
+ * reference and its page frame number (grant_node_create() stores the
+ * physical address shifted right by the page shift). The struct is
+ * packed and items are written back to back into the shared page.
+ */
+struct grant_node_item {
+ uint32_t gref; /* grant reference of the described page */
+ uint32_t pfn; /* page frame number of the described page */
+} __attribute__((packed));
+
+/*
+ * fd for xen_gntalloc driver, used to allocate grant pages.
+ * It is -1 while the device is closed; gntalloc_open() sets it and
+ * gntalloc_close() resets it.
+ */
+int gntalloc_fd = -1;
+
+/*
+ * xenstore path for local domain, now it is '/local/domain/domid/'.
+ * Obtained from xs_get_domain_path() in xenstore_init() and freed in
+ * xenstore_uninit().
+ */
+static char *dompath = NULL;
+/* handle to xenstore read/write operations, opened by xenstore_init() */
+static struct xs_handle *xs = NULL;
+/*
+ * Set once xenstore_cleanup() has succeeded, so that xenstore_init() and
+ * xenstore_uninit() remove the DPDK node at most once; while it is zero
+ * the cleanup is still pending.
+ */
+static bool is_xenstore_cleaned_up;
+
+/*
+ * Reserve a virtual address space.
+ * On success, returns the pointer. On failure, returns NULL.
+ */
+void *
+get_xen_virtual(size_t size, size_t page_sz)
+{
+	/*
+	 * Map one extra page_sz so that the start of the region can be
+	 * rounded up to a page_sz boundary and still leave 'size' bytes.
+	 */
+	void *region = mmap(NULL, size + page_sz, PROT_READ,
+			MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+	if (region == MAP_FAILED) {
+		RTE_LOG(ERR, PMD, "failed get a virtual area\n");
+		return NULL;
+	}
+
+	/* Hand back the first page_sz-aligned address inside the region. */
+	return (void *)RTE_ALIGN_CEIL((uintptr_t)region, page_sz);
+}
+
+/*
+ * Get the physical address for virtual memory starting at va.
+ */
+/*
+ * Translate pg_num consecutive pages starting at va into physical
+ * addresses by reading their entries from PAGEMAP_FNAME.
+ *
+ * va:     virtual address of the first page
+ * pa:     output array with room for pg_num entries
+ * pg_num: number of pages to translate
+ * pg_sz:  page size in bytes
+ *
+ * Returns 0 on success, in which case pa[i] holds the physical address
+ * of page i. Returns ENOENT (positive) if the pagemap cannot be opened
+ * or fully read; pa[] is then left as the failed read left it and must
+ * not be used.
+ */
+int
+get_phys_map(void *va, phys_addr_t pa[], uint32_t pg_num, uint32_t pg_sz)
+{
+	int32_t fd, rc = 0;
+	uint32_t i, nb;
+	off_t ofs;
+
+	/* One pagemap entry per page, indexed by virtual page number. */
+	ofs = (uintptr_t)va / pg_sz * sizeof(*pa);
+	nb = pg_num * sizeof(*pa);
+
+	fd = open(PAGEMAP_FNAME, O_RDONLY);
+	if (fd < 0 ||
+			(rc = pread(fd, pa, nb, ofs)) < 0 ||
+			(rc -= nb) != 0) {
+		RTE_LOG(ERR, PMD, "%s: failed read of %u bytes from \'%s\' "
+			"at offset %lu, error code: %d\n",
+			__func__, nb, PAGEMAP_FNAME, (unsigned long)ofs, errno);
+		rc = ENOENT;
+	}
+
+	/* Only close a descriptor that was actually opened. */
+	if (fd >= 0)
+		close(fd);
+
+	/* Do not turn garbage entries into addresses after a failed read. */
+	if (rc != 0)
+		return rc;
+
+	/* Keep the PFN bits of each entry and scale them to a byte address. */
+	for (i = 0; i != pg_num; i++)
+		pa[i] = (pa[i] & PAGEMAP_PFN_MASK) * pg_sz;
+
+	return 0;
+}
+
+/*
+ * Open the gntalloc device read/write and remember the descriptor in
+ * gntalloc_fd. Returns 0 on success, -1 if the device cannot be opened.
+ */
+int
+gntalloc_open(void)
+{
+	gntalloc_fd = open(XEN_GNTALLOC_FNAME, O_RDWR);
+	if (gntalloc_fd == -1)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Close the gntalloc device if it is open and mark it closed (-1).
+ * Calling this while the device is already closed does nothing.
+ */
+void
+gntalloc_close(void)
+{
+	if (gntalloc_fd == -1)
+		return;
+
+	close(gntalloc_fd);
+	gntalloc_fd = -1;
+}
+
+/*
+ * Allocate 'size' bytes of grant pages shared with DOM0_DOMID and map
+ * them read/write into this process.
+ *
+ * size:        number of bytes; must be a non-zero multiple of the page size
+ * gref:        if not NULL, receives one grant reference per page
+ * start_index: if not NULL, receives the gntalloc index of the first
+ *              page (the value later needed by gntfree())
+ *
+ * Returns the mapped address, or NULL on any failure. When the mapping
+ * fails the freshly allocated grant references are released again.
+ */
+void *
+gntalloc(size_t size, uint32_t *gref, uint64_t *start_index)
+{
+	int page_size = getpagesize();
+	uint32_t i, pg_num;
+	void *va;
+	int rv;
+	struct ioctl_gntalloc_alloc_gref *arg;
+	struct ioctl_gntalloc_dealloc_gref arg_d;
+
+	/* A zero size would underflow the (pg_num - 1) term below. */
+	if (size == 0) {
+		RTE_LOG(ERR, PMD, "%s: zero-sized allocation requested\n",
+			__func__);
+		return NULL;
+	}
+	if (size % page_size) {
+		RTE_LOG(ERR, PMD, "%s: %zu isn't multiple of page size\n",
+			__func__, size);
+		return NULL;
+	}
+
+	pg_num = size / page_size;
+	/* The request struct already holds one gref slot; add the rest. */
+	arg = malloc(sizeof(*arg) + (pg_num - 1) * sizeof(uint32_t));
+	if (arg == NULL)
+		return NULL;
+	arg->domid = DOM0_DOMID;
+	arg->flags = GNTALLOC_FLAG_WRITABLE;
+	arg->count = pg_num;
+
+	rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_ALLOC_GREF, arg);
+	if (rv) {
+		RTE_LOG(ERR, PMD, "%s: ioctl error\n", __func__);
+		free(arg);
+		return NULL;
+	}
+
+	va = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gntalloc_fd, arg->index);
+	if (va == MAP_FAILED) {
+		RTE_LOG(ERR, PMD, "%s: mmap failed\n", __func__);
+		arg_d.count = pg_num;
+		arg_d.index = arg->index;
+		/* The ioctl expects a pointer to the request, not a copy. */
+		ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg_d);
+		free(arg);
+		return NULL;
+	}
+
+	if (gref) {
+		for (i = 0; i < pg_num; i++)
+			gref[i] = arg->gref_ids[i];
+	}
+	if (start_index)
+		*start_index = arg->index;
+
+	free(arg);
+
+	return va;
+}
+
+/*
+ * Allocate and map a single grant page shared with DOM0_DOMID and
+ * register an unmap notification (UNMAP_NOTIFY_CLEAR_BYTE) for it.
+ *
+ * gref: if not NULL, receives the grant reference of the page
+ * pptr: if not NULL, receives the mapped address of the page
+ *
+ * Returns 0 on success, -1 on failure. On failure everything acquired
+ * so far is released and neither *gref nor *pptr is modified, so the
+ * caller never sees a pointer to an already unmapped page.
+ */
+int
+grefwatch_from_alloc(uint32_t *gref, void **pptr)
+{
+	int rv;
+	void *ptr;
+	int pg_size = getpagesize();
+	struct ioctl_gntalloc_alloc_gref arg = {
+		.domid = DOM0_DOMID,
+		.flags = GNTALLOC_FLAG_WRITABLE,
+		.count = 1
+	};
+	struct ioctl_gntalloc_dealloc_gref arg_d;
+	struct ioctl_gntalloc_unmap_notify notify = {
+		.action = UNMAP_NOTIFY_CLEAR_BYTE
+	};
+
+	rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_ALLOC_GREF, &arg);
+	if (rv) {
+		RTE_LOG(ERR, PMD, "%s: ioctl error\n", __func__);
+		return -1;
+	}
+
+	/* Prepared up front: both failure paths below release this page. */
+	arg_d.index = arg.index;
+	arg_d.count = 1;
+
+	ptr = mmap(NULL, pg_size, PROT_READ|PROT_WRITE, MAP_SHARED, gntalloc_fd, arg.index);
+	if (ptr == MAP_FAILED) {
+		RTE_LOG(ERR, PMD, "%s: mmap failed\n", __func__);
+		ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg_d);
+		return -1;
+	}
+
+	notify.index = arg.index;
+	rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &notify);
+	if (rv) {
+		RTE_LOG(ERR, PMD, "%s: unmap notify failed\n", __func__);
+		munmap(ptr, pg_size);
+		ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg_d);
+		return -1;
+	}
+
+	/* Publish the results only once nothing can fail any more. */
+	if (pptr)
+		*pptr = ptr;
+	if (gref)
+		*gref = arg.gref_ids[0];
+
+	return 0;
+}
+
+/*
+ * Undo a gntalloc(): unmap the sz bytes at va and give the matching
+ * grant references (starting at start_index) back to the driver.
+ * Nothing happens when va is NULL or sz is 0.
+ */
+void
+gntfree(void *va, size_t sz, uint64_t start_index)
+{
+	struct ioctl_gntalloc_dealloc_gref release;
+
+	if (va == NULL || sz == 0)
+		return;
+
+	munmap(va, sz);
+
+	/* One grant reference per page of the region. */
+	release.index = start_index;
+	release.count = sz / getpagesize();
+	ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &release);
+}
+
+/*
+ * Remove the DPDK node (dompath + DPDK_XENSTORE_NODE) from xenstore.
+ *
+ * Returns 0 on success, -1 if the path cannot be built without
+ * truncation or the removal fails. A truncated path is rejected so
+ * that a different, shorter node is never removed by mistake.
+ */
+static int
+xenstore_cleanup(void)
+{
+	char store_path[PATH_MAX] = {0};
+	int n;
+
+	n = snprintf(store_path, sizeof(store_path),
+		"%s%s", dompath, DPDK_XENSTORE_NODE);
+	if (n < 0 || (size_t)n >= sizeof(store_path)) {
+		RTE_LOG(ERR, PMD, "%s: xenstore path too long\n", __func__);
+		return -1;
+	}
+
+	if (xs_rm(xs, XBT_NULL, store_path) == false) {
+		RTE_LOG(ERR, PMD, "%s: failed cleanup node\n", __func__);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Open the xenstore connection, look up this domain's id and xenstore
+ * path, and remove a stale DPDK node if that has not been done yet.
+ *
+ * Returns 0 on success, -1 on failure. After a failure the handle may
+ * remain open; xenstore_uninit() releases it.
+ */
+int
+xenstore_init(void)
+{
+	unsigned int len, domid;
+	char *buf;
+	char *end;
+	int parsed;
+
+	xs = xs_domain_open();
+	if (xs == NULL) {
+		RTE_LOG(ERR, PMD,"%s: xs_domain_open failed\n", __func__);
+		return -1;
+	}
+	buf = xs_read(xs, XBT_NULL, "domid", &len);
+	if (buf == NULL) {
+		RTE_LOG(ERR, PMD, "%s: failed read domid\n", __func__);
+		return -1;
+	}
+	errno = 0;
+	domid = strtoul(buf, &end, 0);
+	/* Evaluate the result before the buffer 'end' points into is freed. */
+	parsed = (errno == 0 && end != buf && domid != 0);
+	/* The value returned by xs_read() is owned by the caller. */
+	free(buf);
+	if (!parsed)
+		return -1;
+
+	RTE_LOG(INFO, PMD, "retrieved dom ID = %u\n", domid);
+
+	dompath = xs_get_domain_path(xs, domid);
+	if (dompath == NULL)
+		return -1;
+
+	/*
+	 * NOTE(review): the transaction id is discarded and all later
+	 * operations in this file use XBT_NULL; kept as in the original.
+	 */
+	xs_transaction_start(xs); /* When to stop transaction */
+
+	if (is_xenstore_cleaned_up == 0) {
+		if (xenstore_cleanup())
+			return -1;
+		is_xenstore_cleaned_up = 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down the xenstore state set up by xenstore_init().
+ *
+ * If the DPDK node has not been cleaned up yet, it is removed first,
+ * while the handle is still open. The handle is then closed and
+ * the domain path freed in every case, and both are reset to NULL so
+ * later calls cannot touch released resources.
+ *
+ * Returns 0 on success, -1 if the pending cleanup failed.
+ */
+int
+xenstore_uninit(void)
+{
+	int rv = 0;
+
+	if (xs != NULL) {
+		/* The cleanup talks to xenstore, so it must precede xs_close(). */
+		if (is_xenstore_cleaned_up == 0 && dompath != NULL) {
+			if (xenstore_cleanup())
+				rv = -1;
+			else
+				is_xenstore_cleaned_up = 1;
+		}
+		xs_close(xs);
+		xs = NULL;
+	}
+
+	free(dompath);
+	dompath = NULL;
+
+	return rv;
+}
+
+/*
+ * Write val_str to the xenstore key dompath + key_str.
+ *
+ * key_str: key relative to this domain's xenstore path
+ * val_str: NUL-terminated value; at most PATH_MAX bytes of it are written
+ *
+ * Returns 0 on success, -1 if xenstore is not initialised, the full key
+ * does not fit into PATH_MAX, or the write fails.
+ */
+int
+xenstore_write(const char *key_str, const char *val_str)
+{
+	char grant_path[PATH_MAX];
+	int rv, len;
+
+	if (xs == NULL || dompath == NULL) {
+		RTE_LOG(ERR, PMD, "%s: xenstore init failed\n", __func__);
+		return -1;
+	}
+	rv = snprintf(grant_path, sizeof(grant_path), "%s%s", dompath, key_str);
+	/* A truncated key would silently address a different node. */
+	if (rv < 0 || (size_t)rv >= sizeof(grant_path)) {
+		RTE_LOG(ERR, PMD, "%s: snprintf %s %s failed\n",
+			__func__, dompath, key_str);
+		return -1;
+	}
+	len = strnlen(val_str, PATH_MAX);
+
+	if (xs_write(xs, XBT_NULL, grant_path, val_str, len) == false) {
+		RTE_LOG(ERR, PMD, "%s: xs_write failed\n", __func__);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Build the grant node for pg_num pages.
+ *
+ * Enough grant pages are allocated to hold one grant_node_item
+ * (gref, pfn) per described page, the items are written into them, and
+ * the grant references of those node pages are rendered into val_str as
+ * a comma separated list of decimal numbers.
+ *
+ * pg_num:   number of pages to describe
+ * gref_arr: grant reference of each described page
+ * pa_arr:   physical address of each described page
+ * val_str:  output buffer for the list of node-page grant references
+ * str_size: size of val_str in bytes
+ *
+ * Returns 0 on success; the node pages then stay allocated and mapped.
+ * Returns -1 on failure (including pg_num == 0 and a val_str that is too
+ * small for the complete list); the node pages are released in that case.
+ */
+int
+grant_node_create(uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, char *val_str, size_t str_size)
+{
+	uint64_t start_index;
+	int pg_size;
+	uint32_t pg_shift;
+	void *base;
+	struct grant_node_item *item;
+	uint32_t count, entries_per_pg;
+	uint32_t i, j = 0, k = 0;
+	uint32_t *gref_tmp;
+	size_t used = 0;
+	int n;
+	int rv = 0;
+
+	if (pg_num == 0)
+		return -1;
+
+	pg_size = getpagesize();
+	if (rte_is_power_of_2(pg_size) == 0)
+		return -1;
+	pg_shift = rte_bsf32(pg_size);
+	if (pg_size % sizeof(struct grant_node_item)) {
+		RTE_LOG(ERR, PMD, "pg_size isn't a multiple of grant node item\n");
+		return -1;
+	}
+
+	entries_per_pg = pg_size / sizeof(struct grant_node_item);
+	count = (pg_num + entries_per_pg - 1 ) / entries_per_pg;
+	gref_tmp = malloc(count * sizeof(uint32_t));
+	if (gref_tmp == NULL)
+		return -1;
+	base = gntalloc(pg_size * count, gref_tmp, &start_index);
+	if (base == NULL) {
+		RTE_LOG(ERR, PMD, "%s: gntalloc error of %d pages\n", __func__, count);
+		free(gref_tmp);
+		return -1;
+	}
+
+	/* 'base' is kept for the error path; 'item' is the write cursor. */
+	item = base;
+	while (j < pg_num) {
+		/* Append the gref of node page k, comma separated after the first. */
+		n = snprintf(val_str + used, str_size - used, "%s%u",
+			k == 0 ? "" : ",", gref_tmp[k]);
+		if (n < 0 || (size_t)n >= str_size - used) {
+			/* A truncated list would hide node pages from the reader. */
+			rv = -1;
+			break;
+		}
+		used += (size_t)n;
+		k++;
+
+		/* Fill node page k with as many items as fit or remain. */
+		for (i = 0; i < entries_per_pg && j < pg_num ; i++) {
+			item->gref = gref_arr[j];
+			item->pfn = pa_arr[j] >> pg_shift;
+			item++;
+			j++;
+		}
+	}
+
+	/* Release from the start of the mapping, not from the cursor. */
+	if (rv == -1)
+		gntfree(base, pg_size * count, start_index);
+	free(gref_tmp);
+	return rv;
+}
+
+
+/*
+ * Publish a gntalloc-backed mempool in xenstore.
+ *
+ * Two keys are written below DPDK_XENSTORE_PATH: the grant node list
+ * (<mempool_idx> MEMPOOL_XENSTORE_STR) and the virtual address of the
+ * pool's single memory chunk (<mempool_idx> MEMPOOL_VA_XENSTORE_STR).
+ *
+ * mpool:       the mempool; must consist of exactly one memory chunk
+ * pg_num:      number of pages backing the pool
+ * gref_arr:    grant reference of each page
+ * pa_arr:      physical address of each page
+ * mempool_idx: index used to build the xenstore keys
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+grant_gntalloc_mbuf_pool(struct rte_mempool *mpool, uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, int mempool_idx)
+{
+	char key_str[PATH_MAX] = {0};
+	char val_str[PATH_MAX] = {0};
+	void *mempool_obj_va;
+
+	/*
+	 * Reject unsupported pools before granting node pages or writing
+	 * anything to xenstore, so that this failure leaves nothing behind.
+	 */
+	if (mpool->nb_mem_chunks != 1) {
+		RTE_LOG(ERR, PMD,
+			"mempool with more than 1 chunk is not supported\n");
+		return -1;
+	}
+
+	if (grant_node_create(pg_num, gref_arr, pa_arr, val_str, sizeof(val_str)))
+		return -1;
+
+	if (snprintf(key_str, sizeof(key_str),
+		DPDK_XENSTORE_PATH"%d"MEMPOOL_XENSTORE_STR, mempool_idx) == -1)
+		return -1;
+	if (xenstore_write(key_str, val_str) == -1)
+		return -1;
+
+	if (snprintf(key_str, sizeof(key_str),
+		DPDK_XENSTORE_PATH"%d"MEMPOOL_VA_XENSTORE_STR, mempool_idx) == -1)
+		return -1;
+	mempool_obj_va = STAILQ_FIRST(&mpool->mem_list)->addr;
+	if (snprintf(val_str, sizeof(val_str), "%"PRIxPTR,
+		(uintptr_t)mempool_obj_va) == -1)
+		return -1;
+	if (xenstore_write(key_str, val_str) == -1)
+		return -1;
+
+	return 0;
+}
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.h b/src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.h
new file mode 100644
index 00000000..d973eacb
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/rte_xen_lib.h
@@ -0,0 +1,116 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_XEN_DUMMY_PMD_H
+#define _RTE_XEN_DUMMY_PMD_H
+
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+
+/* per-process pagemap file read by get_phys_map() */
+#define PAGEMAP_FNAME "/proc/self/pagemap"
+/* grant allocation device opened by gntalloc_open() */
+#define XEN_GNTALLOC_FNAME "/dev/xen/gntalloc"
+/* prefix (relative to the domain path) of the keys written by the PMD */
+#define DPDK_XENSTORE_PATH "/control/dpdk/"
+/* node (relative to the domain path) removed by the xenstore cleanup */
+#define DPDK_XENSTORE_NODE "/control/dpdk"
+/*format 0_mempool_gref = "1537,1524,1533" */
+#define MEMPOOL_XENSTORE_STR "_mempool_gref"
+/*format 0_mempool_va = 0x80340000 */
+#define MEMPOOL_VA_XENSTORE_STR "_mempool_va"
+/*format 0_rx_vring_gref = "1537,1524,1533" */
+#define RXVRING_XENSTORE_STR "_rx_vring_gref"
+/*format 0_tx_vring_gref = "1537,1524,1533" */
+#define TXVRING_XENSTORE_STR "_tx_vring_gref"
+#define VRING_FLAG_STR "_vring_flag"
+/*format: event_type_start_0 = 1*/
+#define EVENT_TYPE_START_STR "event_type_start_"
+
+/* domain id the grant pages are shared with */
+#define DOM0_DOMID 0
+/*
+ * the pfn (page frame number) are bits 0-54 (see pagemap.txt in linux
+ * Documentation).
+ *
+ * NOTE(review): bits 0-54 are 55 bits, while PAGEMAP_PFN_BITS is 54;
+ * presumably RTE_LEN2MASK() builds a mask of that many low bits, which
+ * would drop bit 54 - confirm against the RTE_LEN2MASK definition.
+ */
+#define PAGEMAP_PFN_BITS 54
+#define PAGEMAP_PFN_MASK RTE_LEN2MASK(PAGEMAP_PFN_BITS, phys_addr_t)
+
+#define MAP_FLAG 0xA5
+
+/* device argument syntax: key/value pairs such as "mac=<value>" */
+#define RTE_ETH_XENVIRT_PAIRS_DELIM ';'
+#define RTE_ETH_XENVIRT_KEY_VALUE_DELIM '='
+#define RTE_ETH_XENVIRT_MAX_ARGS 1
+#define RTE_ETH_XENVIRT_MAC_PARAM "mac"
+#define RTE_ETH_XENVIRT_MAC_PARAM "mac"
+/*
+ * Holds an Ethernet address together with a validity flag.
+ * NOTE(review): presumably filled from the RTE_ETH_XENVIRT_MAC_PARAM
+ * device argument - confirm against the argument parser.
+ */
+struct xenvirt_dict {
+ uint8_t addr_valid; /* presumably non-zero when addr has been set */
+ struct ether_addr addr;
+};
+
+/* fd of the gntalloc device; -1 while it is closed */
+extern int gntalloc_fd;
+
+/* Open XEN_GNTALLOC_FNAME into gntalloc_fd; returns 0 on success, -1 on error. */
+int
+gntalloc_open(void);
+
+/* Close the gntalloc device if it is open and reset gntalloc_fd to -1. */
+void
+gntalloc_close(void);
+
+/*
+ * Allocate sz bytes (a multiple of the page size) of grant pages shared
+ * with DOM0_DOMID and map them read/write.
+ * gref, if not NULL, receives one grant reference per page;
+ * start_index, if not NULL, receives the gntalloc index of the region.
+ * Returns the mapped address or NULL on failure.
+ */
+void *
+gntalloc(size_t sz, uint32_t *gref, uint64_t *start_index);
+
+/*
+ * Unmap the sz bytes at va and release the grant references starting at
+ * start_index. Does nothing when va is NULL or sz is 0.
+ */
+void
+gntfree(void *va, size_t sz, uint64_t start_index);
+
+/*
+ * Open the xenstore connection and look up this domain's xenstore path.
+ * Returns 0 on success, -1 on failure.
+ */
+int
+xenstore_init(void);
+
+/*
+ * Close the xenstore connection and free the domain path.
+ * Returns 0 on success, -1 on failure.
+ */
+int
+xenstore_uninit(void);
+
+/*
+ * Write val_str to the key formed by the domain path followed by key_str.
+ * Returns 0 on success, -1 on failure.
+ */
+int
+xenstore_write(const char *key_str, const char *val_str);
+
+/*
+ * Fill pa[] with the physical addresses of pg_num pages of size pg_sz
+ * starting at va, using PAGEMAP_FNAME.
+ * Returns 0 on success and a non-zero value (ENOENT) on failure.
+ */
+int
+get_phys_map(void *va, phys_addr_t pa[], uint32_t pg_num, uint32_t pg_sz);
+
+/*
+ * Reserve a read-only anonymous mapping of size + page_sz bytes and
+ * return its start rounded up to a page_sz boundary, or NULL on failure.
+ */
+void *
+get_xen_virtual(size_t size, size_t page_sz);
+
+/*
+ * Allocate and map one grant page with an unmap notification set.
+ * gref and pptr, if not NULL, receive the grant reference and the mapped
+ * address. Returns 0 on success, -1 on failure.
+ */
+int
+grefwatch_from_alloc(uint32_t *gref, void **pptr);
+
+
+/*
+ * Store a (gref, pfn) item for each of the pg_num pages in newly granted
+ * node pages and write the comma separated grant references of those
+ * node pages into val_str (str_size bytes).
+ * Returns 0 on success, -1 on failure.
+ */
+int grant_node_create(uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, char *val_str, size_t str_size);
+
+/*
+ * Publish a single-chunk mempool in xenstore: its grant node list and
+ * the virtual address of its memory chunk, keyed by mempool_idx.
+ * Returns 0 on success, -1 on failure.
+ */
+int
+grant_gntalloc_mbuf_pool(struct rte_mempool *mpool, uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, int mempool_idx);
+
+#endif
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/virtio_logs.h b/src/seastar/dpdk/drivers/net/xenvirt/virtio_logs.h
new file mode 100644
index 00000000..d6c33f7b
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/virtio_logs.h
@@ -0,0 +1,70 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+/*
+ * Initialisation logging. When RTE_LIBRTE_VIRTIO_DEBUG_INIT is defined,
+ * PMD_INIT_LOG() prefixes the message with the calling function's name
+ * and appends a newline; otherwise it compiles to nothing.
+ * PMD_INIT_FUNC_TRACE() logs a " >>" marker at DEBUG level.
+ */
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_INIT
+#define PMD_INIT_LOG(level, fmt, args...) \
+ RTE_LOG(level, PMD, "%s(): " fmt "\n", __func__, ## args)
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+#else
+#define PMD_INIT_LOG(level, fmt, args...) do { } while(0)
+#define PMD_INIT_FUNC_TRACE() do { } while(0)
+#endif
+
+/*
+ * Receive-path logging, enabled by RTE_LIBRTE_VIRTIO_DEBUG_RX.
+ * No newline is appended; the caller's format must supply it.
+ */
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_RX
+#define PMD_RX_LOG(level, fmt, args...) \
+ RTE_LOG(level, PMD, "%s() rx: " fmt , __func__, ## args)
+#else
+#define PMD_RX_LOG(level, fmt, args...) do { } while(0)
+#endif
+
+/*
+ * Transmit-path logging, enabled by RTE_LIBRTE_VIRTIO_DEBUG_TX.
+ * No newline is appended; the caller's format must supply it.
+ */
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_TX
+#define PMD_TX_LOG(level, fmt, args...) \
+ RTE_LOG(level, PMD, "%s() tx: " fmt , __func__, ## args)
+#else
+#define PMD_TX_LOG(level, fmt, args...) do { } while(0)
+#endif
+
+
+/*
+ * General driver logging, enabled by RTE_LIBRTE_VIRTIO_DEBUG_DRIVER.
+ * No newline is appended; the caller's format must supply it.
+ */
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DRIVER
+#define PMD_DRV_LOG(level, fmt, args...) \
+ RTE_LOG(level, PMD, "%s(): " fmt , __func__, ## args)
+#else
+#define PMD_DRV_LOG(level, fmt, args...) do { } while(0)
+#endif
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/src/seastar/dpdk/drivers/net/xenvirt/virtqueue.h b/src/seastar/dpdk/drivers/net/xenvirt/virtqueue.h
new file mode 100644
index 00000000..350eae3e
--- /dev/null
+++ b/src/seastar/dpdk/drivers/net/xenvirt/virtqueue.h
@@ -0,0 +1,273 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_net.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+/*
+ * Queue type indices.
+ * NOTE(review): presumably receive, transmit and control queue - confirm.
+ */
+enum { VTNET_RQ = 0, VTNET_TQ = 1, VTNET_CQ = 2 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+/* Size in bytes of the vq_name buffer in struct virtqueue. */
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+/*
+ * Per-device private data of the PMD; struct virtqueue keeps a pointer
+ * to it in its 'internals' member.
+ */
+struct pmd_internals {
+ struct rte_eth_stats eth_stats; /* statistics counters of the device */
+ int port_id;
+ int virtio_idx;
+};
+
+
+/*
+ * Software state of one virtqueue: the vring itself plus the
+ * bookkeeping needed to track free descriptors and the mbuf attached to
+ * each descriptor chain. The per-descriptor array vq_descx trails the
+ * structure and is indexed by descriptor index.
+ */
+struct virtqueue {
+ char vq_name[VIRTQUEUE_MAX_NAME_SZ];
+ struct rte_mempool *mpool; /**< mempool for mbuf allocation */
+ uint16_t queue_id; /**< DPDK queue index. */
+ uint16_t vq_queue_index; /**< PCI queue index */
+ uint8_t port_id; /**< Device port identifier. */
+
+ void *vq_ring_virt_mem; /**< virtual address of vring*/
+ int vq_alignment;
+ int vq_ring_size;
+
+ struct vring vq_ring; /**< vring keeping desc, used and avail */
+ struct pmd_internals *internals; /**< virtio device internal info. */
+ uint16_t vq_nentries; /**< vring desc numbers */
+ uint16_t vq_desc_head_idx; /**< index of the first descriptor of the free chain */
+ uint16_t vq_free_cnt; /**< num of desc available */
+ uint16_t vq_used_cons_idx; /**< Last consumed desc in used table, trails vq_ring.used->idx*/
+
+ /* per-descriptor extra data, stored at the head index of each chain */
+ struct vq_desc_extra {
+ void *cookie; /* mbuf attached to the chain, NULL when none */
+ uint16_t ndescs; /* number of descriptors in the chain */
+ } vq_descx[0] __rte_cache_aligned;
+};
+
+
+/*
+ * VIRTQUEUE_DUMP(vq): log a one-line summary of the queue state (size,
+ * free and used counts, head/avail/used indices and ring flags) through
+ * PMD_INIT_LOG at DEBUG level. The number of used entries is the 16-bit
+ * difference between used->idx and vq_used_cons_idx.
+ * Compiles to nothing unless RTE_LIBRTE_XENVIRT_DEBUG_DUMP is defined.
+ */
+#ifdef RTE_LIBRTE_XENVIRT_DEBUG_DUMP
+#define VIRTQUEUE_DUMP(vq) do { \
+ uint16_t used_idx, nused; \
+ used_idx = (vq)->vq_ring.used->idx; \
+ nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+ PMD_INIT_LOG(DEBUG, \
+ "VQ: %s - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+ " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+ " avail.flags=0x%x; used.flags=0x%x\n", \
+ (vq)->vq_name, (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+ (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+ (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+ (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+#else
+#define VIRTQUEUE_DUMP(vq) do { } while (0)
+#endif
+
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ *
+ * @param vq
+ *   The virtqueue to dump.
+ */
+void virtqueue_dump(struct virtqueue *vq);
+
+/**
+ * Get all mbufs to be freed.
+ *
+ * NOTE(review): the definition is not part of this header; presumably
+ * each call detaches and returns one mbuf still attached to the queue,
+ * or NULL when none is left - confirm against the implementation.
+ *
+ * @param vq
+ *   The virtqueue to take the mbuf from.
+ */
+struct rte_mbuf * virtqueue_detatch_unused(struct virtqueue *vq);
+
+/**
+ * Tell whether the virtqueue has run out of free descriptors.
+ *
+ * @return 1 when vq_free_cnt is zero, 0 otherwise.
+ */
+static inline int __attribute__((always_inline))
+virtqueue_full(const struct virtqueue *vq)
+{
+	const uint16_t free_descs = vq->vq_free_cnt;
+
+	return free_descs == 0;
+}
+
+/*
+ * Number of used-ring entries not consumed yet: the 16-bit (wrapping)
+ * difference between used->idx and vq_used_cons_idx.
+ */
+#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+/**
+ * Expose the descriptor chain starting at desc_idx in the avail ring.
+ *
+ * The chain head is stored in the next avail slot and avail->idx is
+ * advanced right away instead of waiting for a later notification, so
+ * that a host already running on another CPU can pick it up.
+ */
+static inline void __attribute__((always_inline))
+vq_ring_update_avail(struct virtqueue *vq, uint16_t desc_idx)
+{
+	const uint16_t ring_mask = (uint16_t)(vq->vq_nentries - 1);
+	uint16_t slot;
+
+	/* The slot is the running avail index folded into the ring. */
+	slot = (uint16_t)(vq->vq_ring.avail->idx & ring_mask);
+	vq->vq_ring.avail->ring[slot] = desc_idx;
+
+	/* The slot must be written before the new index becomes visible. */
+	rte_smp_wmb();
+	vq->vq_ring.avail->idx++;
+}
+
+/**
+ * Return the descriptor chain starting at desc_idx to the free list.
+ *
+ * The chain's descriptor count is added back to vq_free_cnt, the chain
+ * is walked to its last descriptor, and the current free list is linked
+ * behind it, making desc_idx the new head of the free list.
+ */
+static inline void __attribute__((always_inline))
+vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
+{
+	struct vq_desc_extra *extra = &vq->vq_descx[desc_idx];
+	struct vring_desc *tail;
+
+	/* Give the chain's descriptors back to the free count. */
+	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + extra->ndescs);
+	extra->ndescs = 0;
+
+	/* Follow the NEXT links to the last descriptor of the chain. */
+	for (tail = &vq->vq_ring.desc[desc_idx];
+			tail->flags & VRING_DESC_F_NEXT;
+			tail = &vq->vq_ring.desc[tail->next])
+		;
+
+	/*
+	 * Append the existing free chain, if any, behind the freed one;
+	 * the freed chain's head becomes the head of the free list.
+	 */
+	tail->next = vq->vq_desc_head_idx;
+	vq->vq_desc_head_idx = desc_idx;
+}
+
+/**
+ * Post one receive buffer: attach the mbuf to the head free descriptor
+ * and make it available to the host.
+ *
+ * The descriptor covers the mbuf buffer from sizeof(struct
+ * virtio_net_hdr) bytes before the end of the headroom up to the end of
+ * the buffer, and is marked host-writable.
+ *
+ * @return 0 on success, -ENOSPC when no descriptor is free, -EMSGSIZE
+ *         when fewer than the needed descriptors are free, -EFAULT when
+ *         the free-list head is not a valid descriptor index.
+ */
+static inline int __attribute__((always_inline))
+virtqueue_enqueue_recv_refill(struct virtqueue *rxvq, struct rte_mbuf *cookie)
+{
+	const uint16_t needed = 1;
+	const uint16_t head_idx = rxvq->vq_desc_head_idx;
+	struct vq_desc_extra *extra;
+	struct vring_desc *desc;
+
+	if (unlikely(rxvq->vq_free_cnt == 0))
+		return -ENOSPC;
+	if (unlikely(rxvq->vq_free_cnt < needed))
+		return -EMSGSIZE;
+	if (unlikely(head_idx >= rxvq->vq_nentries))
+		return -EFAULT;
+
+	/* Remember which mbuf this chain belongs to. */
+	extra = &rxvq->vq_descx[head_idx];
+	extra->cookie = (void *)cookie;
+	extra->ndescs = needed;
+
+	desc = &rxvq->vq_ring.desc[head_idx];
+	desc->addr =
+		(uint64_t) ((uintptr_t)cookie->buf_addr + RTE_PKTMBUF_HEADROOM - sizeof(struct virtio_net_hdr));
+	desc->len = cookie->buf_len - RTE_PKTMBUF_HEADROOM + sizeof(struct virtio_net_hdr);
+	desc->flags = VRING_DESC_F_WRITE;
+
+	/* Unlink the descriptor from the free list, then publish it. */
+	rxvq->vq_desc_head_idx = desc->next;
+	rxvq->vq_free_cnt = (uint16_t)(rxvq->vq_free_cnt - needed);
+	vq_ring_update_avail(rxvq, head_idx);
+
+	return 0;
+}
+
+/**
+ * Queue one mbuf for transmission using a two-descriptor chain: a
+ * header descriptor of sizeof(struct virtio_net_hdr) bytes whose address
+ * is left at 0, followed by a descriptor covering the mbuf data.
+ *
+ * @return 0 on success, -ENOSPC when no descriptor is free, -EMSGSIZE
+ *         when fewer than the needed descriptors are free, -EFAULT when
+ *         the free-list head is not a valid descriptor index.
+ */
+static inline int __attribute__((always_inline))
+virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
+{
+	const uint16_t needed = 2;
+	const uint16_t head_idx = txvq->vq_desc_head_idx;
+	struct vring_desc *ring;
+	struct vring_desc *hdr_desc;
+	struct vring_desc *data_desc;
+	struct vq_desc_extra *extra;
+
+	if (unlikely(txvq->vq_free_cnt == 0))
+		return -ENOSPC;
+	if (unlikely(txvq->vq_free_cnt < needed))
+		return -EMSGSIZE;
+	if (unlikely(head_idx >= txvq->vq_nentries))
+		return -EFAULT;
+
+	/* Remember which mbuf this chain belongs to. */
+	extra = &txvq->vq_descx[head_idx];
+	extra->cookie = (void *)cookie;
+	extra->ndescs = needed;
+
+	ring = txvq->vq_ring.desc;
+
+	/*
+	 * First descriptor: the virtio net header.
+	 * TODO: save one desc here?
+	 */
+	hdr_desc = &ring[head_idx];
+	hdr_desc->addr = 0;
+	hdr_desc->len = sizeof(struct virtio_net_hdr);
+	hdr_desc->flags = VRING_DESC_F_NEXT;
+	hdr_desc->addr = (uintptr_t)NULL;
+
+	/* Second descriptor: the packet data, which ends the chain. */
+	data_desc = &ring[hdr_desc->next];
+	data_desc->addr = (uint64_t)rte_pktmbuf_mtod(cookie, uintptr_t);
+	data_desc->len = cookie->data_len;
+	data_desc->flags = 0;
+
+	/* Unlink both descriptors from the free list, then publish. */
+	txvq->vq_desc_head_idx = data_desc->next;
+	txvq->vq_free_cnt = (uint16_t)(txvq->vq_free_cnt - needed);
+	vq_ring_update_avail(txvq, head_idx);
+
+	return 0;
+}
+
+/**
+ * Take up to num completed buffers off the used ring.
+ *
+ * For every consumed entry the attached mbuf is stored in rx_pkts[],
+ * the length reported by the host in len[], and the descriptor chain is
+ * returned to the free list. The caller must make sure that at least
+ * num used entries are pending (see VIRTQUEUE_NUSED()).
+ *
+ * Processing stops early when a used entry names a descriptor index
+ * outside the ring or a descriptor without an attached mbuf; that entry
+ * is not consumed.
+ *
+ * @return the number of mbufs stored in rx_pkts[].
+ */
+static inline uint16_t __attribute__((always_inline))
+virtqueue_dequeue_burst(struct virtqueue *vq, struct rte_mbuf **rx_pkts, uint32_t *len, uint16_t num)
+{
+	struct vring_used_elem *uep;
+	struct rte_mbuf *cookie;
+	uint16_t used_idx, desc_idx;
+	uint16_t i;
+	/* Caller does the check */
+	for (i = 0; i < num ; i ++) {
+		used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+		uep = &vq->vq_ring.used->ring[used_idx];
+		/*
+		 * The id is written by the other side of the ring; check it
+		 * before truncating it and using it to index vq_descx[].
+		 */
+		if (unlikely(uep->id >= vq->vq_nentries)) {
+			RTE_LOG(ERR, PMD, "%s: invalid descriptor index %u in used slot %u\n",
+				__func__, (unsigned int)uep->id, used_idx);
+			break;
+		}
+		desc_idx = (uint16_t) uep->id;
+		cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie;
+		if (unlikely(cookie == NULL)) {
+			PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u\n",
+				vq->vq_used_cons_idx);
+			RTE_LOG(ERR, PMD, "%s: inconsistent (%u, %u)\n", __func__, used_idx , desc_idx);
+			break;
+		}
+		len[i] = uep->len;
+		rx_pkts[i] = cookie;
+		vq->vq_used_cons_idx++;
+		vq_ring_free_chain(vq, desc_idx);
+		vq->vq_descx[desc_idx].cookie = NULL;
+	}
+	return i;
+}
+
+#endif /* _VIRTQUEUE_H_ */