diff options
Diffstat (limited to 'src/seastar/dpdk/drivers/net/virtio')
24 files changed, 8170 insertions, 0 deletions
diff --git a/src/seastar/dpdk/drivers/net/virtio/Makefile b/src/seastar/dpdk/drivers/net/virtio/Makefile new file mode 100644 index 00000000..b21b8781 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/Makefile @@ -0,0 +1,69 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2014 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_pmd_virtio.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +EXPORT_MAP := rte_pmd_virtio_version.map + +LIBABIVER := 1 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtqueue.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_pci.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c + +ifeq ($(CONFIG_RTE_ARCH_X86),y) +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c +else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),) +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c +endif + +ifeq ($(CONFIG_RTE_VIRTIO_USER),y) +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel_tap.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user_ethdev.c +endif + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/seastar/dpdk/drivers/net/virtio/rte_pmd_virtio_version.map b/src/seastar/dpdk/drivers/net/virtio/rte_pmd_virtio_version.map new file mode 100644 index 00000000..ef353984 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/rte_pmd_virtio_version.map @@ -0,0 +1,4 @@ +DPDK_2.0 { + + local: *; +}; diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_ethdev.c b/src/seastar/dpdk/drivers/net/virtio/virtio_ethdev.c new file mode 100644 index 00000000..983b95f1 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_ethdev.c @@ -0,0 +1,1946 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <unistd.h> + +#include <rte_ethdev.h> +#include <rte_ethdev_pci.h> +#include <rte_memcpy.h> +#include <rte_string_fns.h> +#include <rte_memzone.h> +#include <rte_malloc.h> +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_pci.h> +#include <rte_ether.h> +#include <rte_common.h> +#include <rte_errno.h> + +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_dev.h> + +#include "virtio_ethdev.h" +#include "virtio_pci.h" +#include "virtio_logs.h" +#include "virtqueue.h" +#include "virtio_rxtx.h" + +static int eth_virtio_dev_uninit(struct rte_eth_dev *eth_dev); +static int virtio_dev_configure(struct rte_eth_dev *dev); +static int virtio_dev_start(struct rte_eth_dev *dev); +static void virtio_dev_stop(struct rte_eth_dev *dev); +static void virtio_dev_promiscuous_enable(struct rte_eth_dev *dev); +static void virtio_dev_promiscuous_disable(struct rte_eth_dev *dev); +static void virtio_dev_allmulticast_enable(struct rte_eth_dev *dev); +static void virtio_dev_allmulticast_disable(struct rte_eth_dev *dev); +static void virtio_dev_info_get(struct rte_eth_dev *dev, + struct rte_eth_dev_info *dev_info); +static int virtio_dev_link_update(struct rte_eth_dev *dev, + __rte_unused int wait_to_complete); + +static void virtio_set_hwaddr(struct virtio_hw *hw); +static void virtio_get_hwaddr(struct virtio_hw *hw); + +static void virtio_dev_stats_get(struct rte_eth_dev *dev, + struct rte_eth_stats *stats); +static int virtio_dev_xstats_get(struct rte_eth_dev *dev, + struct rte_eth_xstat *xstats, unsigned n); +static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev, + struct rte_eth_xstat_name *xstats_names, + unsigned limit); +static void virtio_dev_stats_reset(struct rte_eth_dev *dev); +static void virtio_dev_free_mbufs(struct rte_eth_dev *dev); +static int virtio_vlan_filter_set(struct rte_eth_dev *dev, + uint16_t vlan_id, int on); +static int virtio_mac_addr_add(struct rte_eth_dev *dev, + struct ether_addr *mac_addr, + uint32_t index, uint32_t vmdq __rte_unused); +static void virtio_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); +static void virtio_mac_addr_set(struct rte_eth_dev *dev, + struct ether_addr *mac_addr); + +static int virtio_dev_queue_stats_mapping_set( + __rte_unused struct rte_eth_dev *eth_dev, + __rte_unused uint16_t queue_id, + __rte_unused uint8_t stat_idx, + __rte_unused uint8_t is_rx); + +/* + * The set of PCI devices this driver supports + */ +static const struct rte_pci_id pci_id_virtio_map[] = { + { RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID, VIRTIO_PCI_LEGACY_DEVICEID_NET) }, + { RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID, VIRTIO_PCI_MODERN_DEVICEID_NET) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct rte_virtio_xstats_name_off { + char name[RTE_ETH_XSTATS_NAME_SIZE]; + unsigned offset; +}; + +/* [rt]x_qX_ is prepended to the name string here */ +static const struct rte_virtio_xstats_name_off rte_virtio_rxq_stat_strings[] = { + {"good_packets", offsetof(struct virtnet_rx, stats.packets)}, + {"good_bytes", offsetof(struct virtnet_rx, stats.bytes)}, + {"errors", offsetof(struct virtnet_rx, stats.errors)}, + {"multicast_packets", offsetof(struct virtnet_rx, stats.multicast)}, + {"broadcast_packets", offsetof(struct virtnet_rx, stats.broadcast)}, + {"undersize_packets", offsetof(struct virtnet_rx, stats.size_bins[0])}, + {"size_64_packets", offsetof(struct virtnet_rx, stats.size_bins[1])}, + {"size_65_127_packets", offsetof(struct virtnet_rx, stats.size_bins[2])}, + {"size_128_255_packets", offsetof(struct virtnet_rx, stats.size_bins[3])}, + {"size_256_511_packets", offsetof(struct virtnet_rx, stats.size_bins[4])}, + {"size_512_1023_packets", offsetof(struct virtnet_rx, stats.size_bins[5])}, + {"size_1024_1518_packets", offsetof(struct virtnet_rx, stats.size_bins[6])}, + {"size_1519_max_packets", offsetof(struct virtnet_rx, stats.size_bins[7])}, +}; + +/* [rt]x_qX_ is prepended to the name string here */ +static const struct rte_virtio_xstats_name_off rte_virtio_txq_stat_strings[] = { + {"good_packets", offsetof(struct virtnet_tx, stats.packets)}, + {"good_bytes", offsetof(struct virtnet_tx, stats.bytes)}, + {"errors", offsetof(struct virtnet_tx, stats.errors)}, + {"multicast_packets", offsetof(struct virtnet_tx, stats.multicast)}, + {"broadcast_packets", offsetof(struct virtnet_tx, stats.broadcast)}, + {"undersize_packets", offsetof(struct virtnet_tx, stats.size_bins[0])}, + {"size_64_packets", offsetof(struct virtnet_tx, stats.size_bins[1])}, + {"size_65_127_packets", offsetof(struct virtnet_tx, stats.size_bins[2])}, + {"size_128_255_packets", offsetof(struct virtnet_tx, stats.size_bins[3])}, + {"size_256_511_packets", offsetof(struct virtnet_tx, stats.size_bins[4])}, + {"size_512_1023_packets", offsetof(struct virtnet_tx, stats.size_bins[5])}, + {"size_1024_1518_packets", offsetof(struct virtnet_tx, stats.size_bins[6])}, + {"size_1519_max_packets", offsetof(struct virtnet_tx, stats.size_bins[7])}, +}; + +#define VIRTIO_NB_RXQ_XSTATS (sizeof(rte_virtio_rxq_stat_strings) / \ + sizeof(rte_virtio_rxq_stat_strings[0])) +#define VIRTIO_NB_TXQ_XSTATS (sizeof(rte_virtio_txq_stat_strings) / \ + sizeof(rte_virtio_txq_stat_strings[0])) + +struct virtio_hw_internal virtio_hw_internal[RTE_MAX_ETHPORTS]; + +static int +virtio_send_command(struct virtnet_ctl *cvq, struct virtio_pmd_ctrl *ctrl, + int *dlen, int pkt_num) +{ + uint32_t head, i; + int k, sum = 0; + virtio_net_ctrl_ack status = ~0; + struct virtio_pmd_ctrl result; + struct virtqueue *vq; + + ctrl->status = status; + + if (!cvq || !cvq->vq) { + PMD_INIT_LOG(ERR, "Control queue is not supported."); + return -1; + } + vq = cvq->vq; + head = vq->vq_desc_head_idx; + + PMD_INIT_LOG(DEBUG, "vq->vq_desc_head_idx = %d, status = %d, " + "vq->hw->cvq = %p vq = %p", + vq->vq_desc_head_idx, status, vq->hw->cvq, vq); + + if ((vq->vq_free_cnt < ((uint32_t)pkt_num + 2)) || (pkt_num < 1)) + return -1; + + memcpy(cvq->virtio_net_hdr_mz->addr, ctrl, + sizeof(struct virtio_pmd_ctrl)); + + /* + * Format is enforced in qemu code: + * One TX packet for header; + * At least one TX packet per argument; + * One RX packet for ACK. + */ + vq->vq_ring.desc[head].flags = VRING_DESC_F_NEXT; + vq->vq_ring.desc[head].addr = cvq->virtio_net_hdr_mem; + vq->vq_ring.desc[head].len = sizeof(struct virtio_net_ctrl_hdr); + vq->vq_free_cnt--; + i = vq->vq_ring.desc[head].next; + + for (k = 0; k < pkt_num; k++) { + vq->vq_ring.desc[i].flags = VRING_DESC_F_NEXT; + vq->vq_ring.desc[i].addr = cvq->virtio_net_hdr_mem + + sizeof(struct virtio_net_ctrl_hdr) + + sizeof(ctrl->status) + sizeof(uint8_t)*sum; + vq->vq_ring.desc[i].len = dlen[k]; + sum += dlen[k]; + vq->vq_free_cnt--; + i = vq->vq_ring.desc[i].next; + } + + vq->vq_ring.desc[i].flags = VRING_DESC_F_WRITE; + vq->vq_ring.desc[i].addr = cvq->virtio_net_hdr_mem + + sizeof(struct virtio_net_ctrl_hdr); + vq->vq_ring.desc[i].len = sizeof(ctrl->status); + vq->vq_free_cnt--; + + vq->vq_desc_head_idx = vq->vq_ring.desc[i].next; + + vq_update_avail_ring(vq, head); + vq_update_avail_idx(vq); + + PMD_INIT_LOG(DEBUG, "vq->vq_queue_index = %d", vq->vq_queue_index); + + virtqueue_notify(vq); + + rte_rmb(); + while (VIRTQUEUE_NUSED(vq) == 0) { + rte_rmb(); + usleep(100); + } + + while (VIRTQUEUE_NUSED(vq)) { + uint32_t idx, desc_idx, used_idx; + struct vring_used_elem *uep; + + used_idx = (uint32_t)(vq->vq_used_cons_idx + & (vq->vq_nentries - 1)); + uep = &vq->vq_ring.used->ring[used_idx]; + idx = (uint32_t) uep->id; + desc_idx = idx; + + while (vq->vq_ring.desc[desc_idx].flags & VRING_DESC_F_NEXT) { + desc_idx = vq->vq_ring.desc[desc_idx].next; + vq->vq_free_cnt++; + } + + vq->vq_ring.desc[desc_idx].next = vq->vq_desc_head_idx; + vq->vq_desc_head_idx = idx; + + vq->vq_used_cons_idx++; + vq->vq_free_cnt++; + } + + PMD_INIT_LOG(DEBUG, "vq->vq_free_cnt=%d\nvq->vq_desc_head_idx=%d", + vq->vq_free_cnt, vq->vq_desc_head_idx); + + memcpy(&result, cvq->virtio_net_hdr_mz->addr, + sizeof(struct virtio_pmd_ctrl)); + + return result.status; +} + +static int +virtio_set_multiple_queues(struct rte_eth_dev *dev, uint16_t nb_queues) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct virtio_pmd_ctrl ctrl; + int dlen[1]; + int ret; + + ctrl.hdr.class = VIRTIO_NET_CTRL_MQ; + ctrl.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET; + memcpy(ctrl.data, &nb_queues, sizeof(uint16_t)); + + dlen[0] = sizeof(uint16_t); + + ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1); + if (ret) { + PMD_INIT_LOG(ERR, "Multiqueue configured but send command " + "failed, this is too late now..."); + return -EINVAL; + } + + return 0; +} + +static void +virtio_dev_queue_release(void *queue __rte_unused) +{ + /* do nothing */ +} + +static int +virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx) +{ + if (vtpci_queue_idx == hw->max_queue_pairs * 2) + return VTNET_CQ; + else if (vtpci_queue_idx % 2 == 0) + return VTNET_RQ; + else + return VTNET_TQ; +} + +static uint16_t +virtio_get_nr_vq(struct virtio_hw *hw) +{ + uint16_t nr_vq = hw->max_queue_pairs * 2; + + if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VQ)) + nr_vq += 1; + + return nr_vq; +} + +static void +virtio_init_vring(struct virtqueue *vq) +{ + int size = vq->vq_nentries; + struct vring *vr = &vq->vq_ring; + uint8_t *ring_mem = vq->vq_ring_virt_mem; + + PMD_INIT_FUNC_TRACE(); + + /* + * Reinitialise since virtio port might have been stopped and restarted + */ + memset(ring_mem, 0, vq->vq_ring_size); + vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN); + vq->vq_used_cons_idx = 0; + vq->vq_desc_head_idx = 0; + vq->vq_avail_idx = 0; + vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1); + vq->vq_free_cnt = vq->vq_nentries; + memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries); + + vring_desc_init(vr->desc, size); + + /* + * Disable device(host) interrupting guest + */ + virtqueue_disable_intr(vq); +} + +static int +virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx) +{ + char vq_name[VIRTQUEUE_MAX_NAME_SZ]; + char vq_hdr_name[VIRTQUEUE_MAX_NAME_SZ]; + const struct rte_memzone *mz = NULL, *hdr_mz = NULL; + unsigned int vq_size, size; + struct virtio_hw *hw = dev->data->dev_private; + struct virtnet_rx *rxvq = NULL; + struct virtnet_tx *txvq = NULL; + struct virtnet_ctl *cvq = NULL; + struct virtqueue *vq; + size_t sz_hdr_mz = 0; + void *sw_ring = NULL; + int queue_type = virtio_get_queue_type(hw, vtpci_queue_idx); + int ret; + + PMD_INIT_LOG(DEBUG, "setting up queue: %u", vtpci_queue_idx); + + /* + * Read the virtqueue size from the Queue Size field + * Always power of 2 and if 0 virtqueue does not exist + */ + vq_size = VTPCI_OPS(hw)->get_queue_num(hw, vtpci_queue_idx); + PMD_INIT_LOG(DEBUG, "vq_size: %u", vq_size); + if (vq_size == 0) { + PMD_INIT_LOG(ERR, "virtqueue does not exist"); + return -EINVAL; + } + + if (!rte_is_power_of_2(vq_size)) { + PMD_INIT_LOG(ERR, "virtqueue size is not powerof 2"); + return -EINVAL; + } + + snprintf(vq_name, sizeof(vq_name), "port%d_vq%d", + dev->data->port_id, vtpci_queue_idx); + + size = RTE_ALIGN_CEIL(sizeof(*vq) + + vq_size * sizeof(struct vq_desc_extra), + RTE_CACHE_LINE_SIZE); + if (queue_type == VTNET_TQ) { + /* + * For each xmit packet, allocate a virtio_net_hdr + * and indirect ring elements + */ + sz_hdr_mz = vq_size * sizeof(struct virtio_tx_region); + } else if (queue_type == VTNET_CQ) { + /* Allocate a page for control vq command, data and status */ + sz_hdr_mz = PAGE_SIZE; + } + + vq = rte_zmalloc_socket(vq_name, size, RTE_CACHE_LINE_SIZE, + SOCKET_ID_ANY); + if (vq == NULL) { + PMD_INIT_LOG(ERR, "can not allocate vq"); + return -ENOMEM; + } + hw->vqs[vtpci_queue_idx] = vq; + + vq->hw = hw; + vq->vq_queue_index = vtpci_queue_idx; + vq->vq_nentries = vq_size; + + /* + * Reserve a memzone for vring elements + */ + size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN); + vq->vq_ring_size = RTE_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN); + PMD_INIT_LOG(DEBUG, "vring_size: %d, rounded_vring_size: %d", + size, vq->vq_ring_size); + + mz = rte_memzone_reserve_aligned(vq_name, vq->vq_ring_size, + SOCKET_ID_ANY, + 0, VIRTIO_PCI_VRING_ALIGN); + if (mz == NULL) { + if (rte_errno == EEXIST) + mz = rte_memzone_lookup(vq_name); + if (mz == NULL) { + ret = -ENOMEM; + goto fail_q_alloc; + } + } + + memset(mz->addr, 0, sizeof(mz->len)); + + vq->vq_ring_mem = mz->phys_addr; + vq->vq_ring_virt_mem = mz->addr; + PMD_INIT_LOG(DEBUG, "vq->vq_ring_mem: 0x%" PRIx64, + (uint64_t)mz->phys_addr); + PMD_INIT_LOG(DEBUG, "vq->vq_ring_virt_mem: 0x%" PRIx64, + (uint64_t)(uintptr_t)mz->addr); + + virtio_init_vring(vq); + + if (sz_hdr_mz) { + snprintf(vq_hdr_name, sizeof(vq_hdr_name), "port%d_vq%d_hdr", + dev->data->port_id, vtpci_queue_idx); + hdr_mz = rte_memzone_reserve_aligned(vq_hdr_name, sz_hdr_mz, + SOCKET_ID_ANY, 0, + RTE_CACHE_LINE_SIZE); + if (hdr_mz == NULL) { + if (rte_errno == EEXIST) + hdr_mz = rte_memzone_lookup(vq_hdr_name); + if (hdr_mz == NULL) { + ret = -ENOMEM; + goto fail_q_alloc; + } + } + } + + if (queue_type == VTNET_RQ) { + size_t sz_sw = (RTE_PMD_VIRTIO_RX_MAX_BURST + vq_size) * + sizeof(vq->sw_ring[0]); + + sw_ring = rte_zmalloc_socket("sw_ring", sz_sw, + RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); + if (!sw_ring) { + PMD_INIT_LOG(ERR, "can not allocate RX soft ring"); + ret = -ENOMEM; + goto fail_q_alloc; + } + + vq->sw_ring = sw_ring; + rxvq = &vq->rxq; + rxvq->vq = vq; + rxvq->port_id = dev->data->port_id; + rxvq->mz = mz; + } else if (queue_type == VTNET_TQ) { + txvq = &vq->txq; + txvq->vq = vq; + txvq->port_id = dev->data->port_id; + txvq->mz = mz; + txvq->virtio_net_hdr_mz = hdr_mz; + txvq->virtio_net_hdr_mem = hdr_mz->phys_addr; + } else if (queue_type == VTNET_CQ) { + cvq = &vq->cq; + cvq->vq = vq; + cvq->mz = mz; + cvq->virtio_net_hdr_mz = hdr_mz; + cvq->virtio_net_hdr_mem = hdr_mz->phys_addr; + memset(cvq->virtio_net_hdr_mz->addr, 0, PAGE_SIZE); + + hw->cvq = cvq; + } + + /* For virtio_user case (that is when hw->dev is NULL), we use + * virtual address. And we need properly set _offset_, please see + * VIRTIO_MBUF_DATA_DMA_ADDR in virtqueue.h for more information. + */ + if (!hw->virtio_user_dev) + vq->offset = offsetof(struct rte_mbuf, buf_physaddr); + else { + vq->vq_ring_mem = (uintptr_t)mz->addr; + vq->offset = offsetof(struct rte_mbuf, buf_addr); + if (queue_type == VTNET_TQ) + txvq->virtio_net_hdr_mem = (uintptr_t)hdr_mz->addr; + else if (queue_type == VTNET_CQ) + cvq->virtio_net_hdr_mem = (uintptr_t)hdr_mz->addr; + } + + if (queue_type == VTNET_TQ) { + struct virtio_tx_region *txr; + unsigned int i; + + txr = hdr_mz->addr; + memset(txr, 0, vq_size * sizeof(*txr)); + for (i = 0; i < vq_size; i++) { + struct vring_desc *start_dp = txr[i].tx_indir; + + vring_desc_init(start_dp, RTE_DIM(txr[i].tx_indir)); + + /* first indirect descriptor is always the tx header */ + start_dp->addr = txvq->virtio_net_hdr_mem + + i * sizeof(*txr) + + offsetof(struct virtio_tx_region, tx_hdr); + + start_dp->len = hw->vtnet_hdr_size; + start_dp->flags = VRING_DESC_F_NEXT; + } + } + + if (VTPCI_OPS(hw)->setup_queue(hw, vq) < 0) { + PMD_INIT_LOG(ERR, "setup_queue failed"); + return -EINVAL; + } + + return 0; + +fail_q_alloc: + rte_free(sw_ring); + rte_memzone_free(hdr_mz); + rte_memzone_free(mz); + rte_free(vq); + + return ret; +} + +static void +virtio_free_queues(struct virtio_hw *hw) +{ + uint16_t nr_vq = virtio_get_nr_vq(hw); + struct virtqueue *vq; + int queue_type; + uint16_t i; + + if (hw->vqs == NULL) + return; + + for (i = 0; i < nr_vq; i++) { + vq = hw->vqs[i]; + if (!vq) + continue; + + queue_type = virtio_get_queue_type(hw, i); + if (queue_type == VTNET_RQ) { + rte_free(vq->sw_ring); + rte_memzone_free(vq->rxq.mz); + } else if (queue_type == VTNET_TQ) { + rte_memzone_free(vq->txq.mz); + rte_memzone_free(vq->txq.virtio_net_hdr_mz); + } else { + rte_memzone_free(vq->cq.mz); + rte_memzone_free(vq->cq.virtio_net_hdr_mz); + } + + rte_free(vq); + hw->vqs[i] = NULL; + } + + rte_free(hw->vqs); + hw->vqs = NULL; +} + +static int +virtio_alloc_queues(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + uint16_t nr_vq = virtio_get_nr_vq(hw); + uint16_t i; + int ret; + + hw->vqs = rte_zmalloc(NULL, sizeof(struct virtqueue *) * nr_vq, 0); + if (!hw->vqs) { + PMD_INIT_LOG(ERR, "failed to allocate vqs"); + return -ENOMEM; + } + + for (i = 0; i < nr_vq; i++) { + ret = virtio_init_queue(dev, i); + if (ret < 0) { + virtio_free_queues(hw); + return ret; + } + } + + return 0; +} + +static void virtio_queues_unbind_intr(struct rte_eth_dev *dev); + +static void +virtio_dev_close(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct rte_intr_conf *intr_conf = &dev->data->dev_conf.intr_conf; + + PMD_INIT_LOG(DEBUG, "virtio_dev_close"); + + /* reset the NIC */ + if (dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + VTPCI_OPS(hw)->set_config_irq(hw, VIRTIO_MSI_NO_VECTOR); + if (intr_conf->rxq) + virtio_queues_unbind_intr(dev); + + if (intr_conf->lsc || intr_conf->rxq) { + rte_intr_disable(dev->intr_handle); + rte_intr_efd_disable(dev->intr_handle); + rte_free(dev->intr_handle->intr_vec); + dev->intr_handle->intr_vec = NULL; + } + + vtpci_reset(hw); + virtio_dev_free_mbufs(dev); + virtio_free_queues(hw); +} + +static void +virtio_dev_promiscuous_enable(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct virtio_pmd_ctrl ctrl; + int dlen[1]; + int ret; + + if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) { + PMD_INIT_LOG(INFO, "host does not support rx control"); + return; + } + + ctrl.hdr.class = VIRTIO_NET_CTRL_RX; + ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_PROMISC; + ctrl.data[0] = 1; + dlen[0] = 1; + + ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1); + if (ret) + PMD_INIT_LOG(ERR, "Failed to enable promisc"); +} + +static void +virtio_dev_promiscuous_disable(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct virtio_pmd_ctrl ctrl; + int dlen[1]; + int ret; + + if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) { + PMD_INIT_LOG(INFO, "host does not support rx control"); + return; + } + + ctrl.hdr.class = VIRTIO_NET_CTRL_RX; + ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_PROMISC; + ctrl.data[0] = 0; + dlen[0] = 1; + + ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1); + if (ret) + PMD_INIT_LOG(ERR, "Failed to disable promisc"); +} + +static void +virtio_dev_allmulticast_enable(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct virtio_pmd_ctrl ctrl; + int dlen[1]; + int ret; + + if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) { + PMD_INIT_LOG(INFO, "host does not support rx control"); + return; + } + + ctrl.hdr.class = VIRTIO_NET_CTRL_RX; + ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_ALLMULTI; + ctrl.data[0] = 1; + dlen[0] = 1; + + ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1); + if (ret) + PMD_INIT_LOG(ERR, "Failed to enable allmulticast"); +} + +static void +virtio_dev_allmulticast_disable(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct virtio_pmd_ctrl ctrl; + int dlen[1]; + int ret; + + if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) { + PMD_INIT_LOG(INFO, "host does not support rx control"); + return; + } + + ctrl.hdr.class = VIRTIO_NET_CTRL_RX; + ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_ALLMULTI; + ctrl.data[0] = 0; + dlen[0] = 1; + + ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1); + if (ret) + PMD_INIT_LOG(ERR, "Failed to disable allmulticast"); +} + +#define VLAN_TAG_LEN 4 /* 802.3ac tag (not DMA'd) */ +static int +virtio_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct virtio_hw *hw = dev->data->dev_private; + uint32_t ether_hdr_len = ETHER_HDR_LEN + VLAN_TAG_LEN + + hw->vtnet_hdr_size; + uint32_t frame_size = mtu + ether_hdr_len; + uint32_t max_frame_size = hw->max_mtu + ether_hdr_len; + + max_frame_size = RTE_MIN(max_frame_size, VIRTIO_MAX_RX_PKTLEN); + + if (mtu < ETHER_MIN_MTU || frame_size > max_frame_size) { + PMD_INIT_LOG(ERR, "MTU should be between %d and %d", + ETHER_MIN_MTU, max_frame_size - ether_hdr_len); + return -EINVAL; + } + return 0; +} + +static int +virtio_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id) +{ + struct virtnet_rx *rxvq = dev->data->rx_queues[queue_id]; + struct virtqueue *vq = rxvq->vq; + + virtqueue_enable_intr(vq); + return 0; +} + +static int +virtio_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id) +{ + struct virtnet_rx *rxvq = dev->data->rx_queues[queue_id]; + struct virtqueue *vq = rxvq->vq; + + virtqueue_disable_intr(vq); + return 0; +} + +/* + * dev_ops for virtio, bare necessities for basic operation + */ +static const struct eth_dev_ops virtio_eth_dev_ops = { + .dev_configure = virtio_dev_configure, + .dev_start = virtio_dev_start, + .dev_stop = virtio_dev_stop, + .dev_close = virtio_dev_close, + .promiscuous_enable = virtio_dev_promiscuous_enable, + .promiscuous_disable = virtio_dev_promiscuous_disable, + .allmulticast_enable = virtio_dev_allmulticast_enable, + .allmulticast_disable = virtio_dev_allmulticast_disable, + .mtu_set = virtio_mtu_set, + .dev_infos_get = virtio_dev_info_get, + .stats_get = virtio_dev_stats_get, + .xstats_get = virtio_dev_xstats_get, + .xstats_get_names = virtio_dev_xstats_get_names, + .stats_reset = virtio_dev_stats_reset, + .xstats_reset = virtio_dev_stats_reset, + .link_update = virtio_dev_link_update, + .rx_queue_setup = virtio_dev_rx_queue_setup, + .rx_queue_intr_enable = virtio_dev_rx_queue_intr_enable, + .rx_queue_intr_disable = virtio_dev_rx_queue_intr_disable, + .rx_queue_release = virtio_dev_queue_release, + .rx_descriptor_done = virtio_dev_rx_queue_done, + .tx_queue_setup = virtio_dev_tx_queue_setup, + .tx_queue_release = virtio_dev_queue_release, + /* collect stats per queue */ + .queue_stats_mapping_set = virtio_dev_queue_stats_mapping_set, + .vlan_filter_set = virtio_vlan_filter_set, + .mac_addr_add = virtio_mac_addr_add, + .mac_addr_remove = virtio_mac_addr_remove, + .mac_addr_set = virtio_mac_addr_set, +}; + +static inline int +virtio_dev_atomic_read_link_status(struct rte_eth_dev *dev, + struct rte_eth_link *link) +{ + struct rte_eth_link *dst = link; + struct rte_eth_link *src = &(dev->data->dev_link); + + if (rte_atomic64_cmpset((uint64_t *)dst, *(uint64_t *)dst, + *(uint64_t *)src) == 0) + return -1; + + return 0; +} + +/** + * Atomically writes the link status information into global + * structure rte_eth_dev. + * + * @param dev + * - Pointer to the structure rte_eth_dev to read from. + * - Pointer to the buffer to be saved with the link status. + * + * @return + * - On success, zero. + * - On failure, negative value. + */ +static inline int +virtio_dev_atomic_write_link_status(struct rte_eth_dev *dev, + struct rte_eth_link *link) +{ + struct rte_eth_link *dst = &(dev->data->dev_link); + struct rte_eth_link *src = link; + + if (rte_atomic64_cmpset((uint64_t *)dst, *(uint64_t *)dst, + *(uint64_t *)src) == 0) + return -1; + + return 0; +} + +static void +virtio_update_stats(struct rte_eth_dev *dev, struct rte_eth_stats *stats) +{ + unsigned i; + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + const struct virtnet_tx *txvq = dev->data->tx_queues[i]; + if (txvq == NULL) + continue; + + stats->opackets += txvq->stats.packets; + stats->obytes += txvq->stats.bytes; + stats->oerrors += txvq->stats.errors; + + if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) { + stats->q_opackets[i] = txvq->stats.packets; + stats->q_obytes[i] = txvq->stats.bytes; + } + } + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + const struct virtnet_rx *rxvq = dev->data->rx_queues[i]; + if (rxvq == NULL) + continue; + + stats->ipackets += rxvq->stats.packets; + stats->ibytes += rxvq->stats.bytes; + stats->ierrors += rxvq->stats.errors; + + if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) { + stats->q_ipackets[i] = rxvq->stats.packets; + stats->q_ibytes[i] = rxvq->stats.bytes; + } + } + + stats->rx_nombuf = dev->data->rx_mbuf_alloc_failed; +} + +static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev, + struct rte_eth_xstat_name *xstats_names, + __rte_unused unsigned limit) +{ + unsigned i; + unsigned count = 0; + unsigned t; + + unsigned nstats = dev->data->nb_tx_queues * VIRTIO_NB_TXQ_XSTATS + + dev->data->nb_rx_queues * VIRTIO_NB_RXQ_XSTATS; + + if (xstats_names != NULL) { + /* Note: limit checked in rte_eth_xstats_names() */ + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + struct virtqueue *rxvq = dev->data->rx_queues[i]; + if (rxvq == NULL) + continue; + for (t = 0; t < VIRTIO_NB_RXQ_XSTATS; t++) { + snprintf(xstats_names[count].name, + sizeof(xstats_names[count].name), + "rx_q%u_%s", i, + rte_virtio_rxq_stat_strings[t].name); + count++; + } + } + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + struct virtqueue *txvq = dev->data->tx_queues[i]; + if (txvq == NULL) + continue; + for (t = 0; t < VIRTIO_NB_TXQ_XSTATS; t++) { + snprintf(xstats_names[count].name, + sizeof(xstats_names[count].name), + "tx_q%u_%s", i, + rte_virtio_txq_stat_strings[t].name); + count++; + } + } + return count; + } + return nstats; +} + +static int +virtio_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats, + unsigned n) +{ + unsigned i; + unsigned count = 0; + + unsigned nstats = dev->data->nb_tx_queues * VIRTIO_NB_TXQ_XSTATS + + dev->data->nb_rx_queues * VIRTIO_NB_RXQ_XSTATS; + + if (n < nstats) + return nstats; + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + struct virtnet_rx *rxvq = dev->data->rx_queues[i]; + + if (rxvq == NULL) + continue; + + unsigned t; + + for (t = 0; t < VIRTIO_NB_RXQ_XSTATS; t++) { + xstats[count].value = *(uint64_t *)(((char *)rxvq) + + rte_virtio_rxq_stat_strings[t].offset); + xstats[count].id = count; + count++; + } + } + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + struct virtnet_tx *txvq = dev->data->tx_queues[i]; + + if (txvq == NULL) + continue; + + unsigned t; + + for (t = 0; t < VIRTIO_NB_TXQ_XSTATS; t++) { + xstats[count].value = *(uint64_t *)(((char *)txvq) + + rte_virtio_txq_stat_strings[t].offset); + xstats[count].id = count; + count++; + } + } + + return count; +} + +static void +virtio_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) +{ + virtio_update_stats(dev, stats); +} + +static void +virtio_dev_stats_reset(struct rte_eth_dev *dev) +{ + unsigned int i; + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + struct virtnet_tx *txvq = dev->data->tx_queues[i]; + if (txvq == NULL) + continue; + + txvq->stats.packets = 0; + txvq->stats.bytes = 0; + txvq->stats.errors = 0; + txvq->stats.multicast = 0; + txvq->stats.broadcast = 0; + memset(txvq->stats.size_bins, 0, + sizeof(txvq->stats.size_bins[0]) * 8); + } + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + struct virtnet_rx *rxvq = dev->data->rx_queues[i]; + if (rxvq == NULL) + continue; + + rxvq->stats.packets = 0; + rxvq->stats.bytes = 0; + rxvq->stats.errors = 0; + rxvq->stats.multicast = 0; + rxvq->stats.broadcast = 0; + memset(rxvq->stats.size_bins, 0, + sizeof(rxvq->stats.size_bins[0]) * 8); + } +} + +static void +virtio_set_hwaddr(struct virtio_hw *hw) +{ + vtpci_write_dev_config(hw, + offsetof(struct virtio_net_config, mac), + &hw->mac_addr, ETHER_ADDR_LEN); +} + +static void +virtio_get_hwaddr(struct virtio_hw *hw) +{ + if (vtpci_with_feature(hw, VIRTIO_NET_F_MAC)) { + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, mac), + &hw->mac_addr, ETHER_ADDR_LEN); + } else { + eth_random_addr(&hw->mac_addr[0]); + virtio_set_hwaddr(hw); + } +} + +static int +virtio_mac_table_set(struct virtio_hw *hw, + const struct virtio_net_ctrl_mac *uc, + const struct virtio_net_ctrl_mac *mc) +{ + struct virtio_pmd_ctrl ctrl; + int err, len[2]; + + if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + PMD_DRV_LOG(INFO, "host does not support mac table"); + return -1; + } + + ctrl.hdr.class = VIRTIO_NET_CTRL_MAC; + ctrl.hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET; + + len[0] = uc->entries * ETHER_ADDR_LEN + sizeof(uc->entries); + memcpy(ctrl.data, uc, len[0]); + + len[1] = mc->entries * ETHER_ADDR_LEN + sizeof(mc->entries); + memcpy(ctrl.data + len[0], mc, len[1]); + + err = virtio_send_command(hw->cvq, &ctrl, len, 2); + if (err != 0) + PMD_DRV_LOG(NOTICE, "mac table set failed: %d", err); + return err; +} + +static int +virtio_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, + uint32_t index, uint32_t vmdq __rte_unused) +{ + struct virtio_hw *hw = dev->data->dev_private; + const struct ether_addr *addrs = dev->data->mac_addrs; + unsigned int i; + struct virtio_net_ctrl_mac *uc, *mc; + + if (index >= VIRTIO_MAX_MAC_ADDRS) { + PMD_DRV_LOG(ERR, "mac address index %u out of range", index); + return -EINVAL; + } + + uc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(uc->entries)); + uc->entries = 0; + mc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(mc->entries)); + mc->entries = 0; + + for (i = 0; i < VIRTIO_MAX_MAC_ADDRS; i++) { + const struct ether_addr *addr + = (i == index) ? mac_addr : addrs + i; + struct virtio_net_ctrl_mac *tbl + = is_multicast_ether_addr(addr) ? mc : uc; + + memcpy(&tbl->macs[tbl->entries++], addr, ETHER_ADDR_LEN); + } + + return virtio_mac_table_set(hw, uc, mc); +} + +static void +virtio_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct ether_addr *addrs = dev->data->mac_addrs; + struct virtio_net_ctrl_mac *uc, *mc; + unsigned int i; + + if (index >= VIRTIO_MAX_MAC_ADDRS) { + PMD_DRV_LOG(ERR, "mac address index %u out of range", index); + return; + } + + uc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(uc->entries)); + uc->entries = 0; + mc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(mc->entries)); + mc->entries = 0; + + for (i = 0; i < VIRTIO_MAX_MAC_ADDRS; i++) { + struct virtio_net_ctrl_mac *tbl; + + if (i == index || is_zero_ether_addr(addrs + i)) + continue; + + tbl = is_multicast_ether_addr(addrs + i) ? mc : uc; + memcpy(&tbl->macs[tbl->entries++], addrs + i, ETHER_ADDR_LEN); + } + + virtio_mac_table_set(hw, uc, mc); +} + +static void +virtio_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) +{ + struct virtio_hw *hw = dev->data->dev_private; + + memcpy(hw->mac_addr, mac_addr, ETHER_ADDR_LEN); + + /* Use atomic update if available */ + if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + struct virtio_pmd_ctrl ctrl; + int len = ETHER_ADDR_LEN; + + ctrl.hdr.class = VIRTIO_NET_CTRL_MAC; + ctrl.hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET; + + memcpy(ctrl.data, mac_addr, ETHER_ADDR_LEN); + virtio_send_command(hw->cvq, &ctrl, &len, 1); + } else if (vtpci_with_feature(hw, VIRTIO_NET_F_MAC)) + virtio_set_hwaddr(hw); +} + +static int +virtio_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct virtio_pmd_ctrl ctrl; + int len; + + if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VLAN)) + return -ENOTSUP; + + ctrl.hdr.class = VIRTIO_NET_CTRL_VLAN; + ctrl.hdr.cmd = on ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; + memcpy(ctrl.data, &vlan_id, sizeof(vlan_id)); + len = sizeof(vlan_id); + + return virtio_send_command(hw->cvq, &ctrl, &len, 1); +} + +static int +virtio_negotiate_features(struct virtio_hw *hw, uint64_t req_features) +{ + uint64_t host_features; + + /* Prepare guest_features: feature that driver wants to support */ + PMD_INIT_LOG(DEBUG, "guest_features before negotiate = %" PRIx64, + req_features); + + /* Read device(host) feature bits */ + host_features = VTPCI_OPS(hw)->get_features(hw); + PMD_INIT_LOG(DEBUG, "host_features before negotiate = %" PRIx64, + host_features); + + /* If supported, ensure MTU value is valid before acknowledging it. */ + if (host_features & req_features & (1ULL << VIRTIO_NET_F_MTU)) { + struct virtio_net_config config; + + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, mtu), + &config.mtu, sizeof(config.mtu)); + + if (config.mtu < ETHER_MIN_MTU) + req_features &= ~(1ULL << VIRTIO_NET_F_MTU); + } + + /* + * Negotiate features: Subset of device feature bits are written back + * guest feature bits. + */ + hw->guest_features = req_features; + hw->guest_features = vtpci_negotiate_features(hw, host_features); + PMD_INIT_LOG(DEBUG, "features after negotiate = %" PRIx64, + hw->guest_features); + + if (hw->modern) { + if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) { + PMD_INIT_LOG(ERR, + "VIRTIO_F_VERSION_1 features is not enabled."); + return -1; + } + vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_FEATURES_OK); + if (!(vtpci_get_status(hw) & VIRTIO_CONFIG_STATUS_FEATURES_OK)) { + PMD_INIT_LOG(ERR, + "failed to set FEATURES_OK status!"); + return -1; + } + } + + hw->req_guest_features = req_features; + + return 0; +} + +/* + * Process Virtio Config changed interrupt and call the callback + * if link state changed. + */ +void +virtio_interrupt_handler(void *param) +{ + struct rte_eth_dev *dev = param; + struct virtio_hw *hw = dev->data->dev_private; + uint8_t isr; + + /* Read interrupt status which clears interrupt */ + isr = vtpci_isr(hw); + PMD_DRV_LOG(INFO, "interrupt status = %#x", isr); + + if (rte_intr_enable(dev->intr_handle) < 0) + PMD_DRV_LOG(ERR, "interrupt enable failed"); + + if (isr & VIRTIO_PCI_ISR_CONFIG) { + if (virtio_dev_link_update(dev, 0) == 0) + _rte_eth_dev_callback_process(dev, + RTE_ETH_EVENT_INTR_LSC, NULL); + } + +} + +static void +rx_func_get(struct rte_eth_dev *eth_dev) +{ + struct virtio_hw *hw = eth_dev->data->dev_private; + if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) + eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts; + else + eth_dev->rx_pkt_burst = &virtio_recv_pkts; +} + +/* Only support 1:1 queue/interrupt mapping so far. + * TODO: support n:1 queue/interrupt mapping when there are limited number of + * interrupt vectors (<N+1). + */ +static int +virtio_queues_bind_intr(struct rte_eth_dev *dev) +{ + uint32_t i; + struct virtio_hw *hw = dev->data->dev_private; + + PMD_INIT_LOG(INFO, "queue/interrupt binding"); + for (i = 0; i < dev->data->nb_rx_queues; ++i) { + dev->intr_handle->intr_vec[i] = i + 1; + if (VTPCI_OPS(hw)->set_queue_irq(hw, hw->vqs[i * 2], i + 1) == + VIRTIO_MSI_NO_VECTOR) { + PMD_DRV_LOG(ERR, "failed to set queue vector"); + return -EBUSY; + } + } + + return 0; +} + +static void +virtio_queues_unbind_intr(struct rte_eth_dev *dev) +{ + uint32_t i; + struct virtio_hw *hw = dev->data->dev_private; + + PMD_INIT_LOG(INFO, "queue/interrupt unbinding"); + for (i = 0; i < dev->data->nb_rx_queues; ++i) + VTPCI_OPS(hw)->set_queue_irq(hw, + hw->vqs[i * VTNET_CQ], + VIRTIO_MSI_NO_VECTOR); +} + +static int +virtio_configure_intr(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + + if (!rte_intr_cap_multiple(dev->intr_handle)) { + PMD_INIT_LOG(ERR, "Multiple intr vector not supported"); + return -ENOTSUP; + } + + if (rte_intr_efd_enable(dev->intr_handle, dev->data->nb_rx_queues)) { + PMD_INIT_LOG(ERR, "Fail to create eventfd"); + return -1; + } + + if (!dev->intr_handle->intr_vec) { + dev->intr_handle->intr_vec = + rte_zmalloc("intr_vec", + hw->max_queue_pairs * sizeof(int), 0); + if (!dev->intr_handle->intr_vec) { + PMD_INIT_LOG(ERR, "Failed to allocate %u rxq vectors", + hw->max_queue_pairs); + return -ENOMEM; + } + } + + /* Re-register callback to update max_intr */ + rte_intr_callback_unregister(dev->intr_handle, + virtio_interrupt_handler, + dev); + rte_intr_callback_register(dev->intr_handle, + virtio_interrupt_handler, + dev); + + /* DO NOT try to remove this! This function will enable msix, or QEMU + * will encounter SIGSEGV when DRIVER_OK is sent. + * And for legacy devices, this should be done before queue/vec binding + * to change the config size from 20 to 24, or VIRTIO_MSI_QUEUE_VECTOR + * (22) will be ignored. + */ + if (rte_intr_enable(dev->intr_handle) < 0) { + PMD_DRV_LOG(ERR, "interrupt enable failed"); + return -1; + } + + if (virtio_queues_bind_intr(dev) < 0) { + PMD_INIT_LOG(ERR, "Failed to bind queue/interrupt"); + return -1; + } + + return 0; +} + +/* reset device and renegotiate features if needed */ +static int +virtio_init_device(struct rte_eth_dev *eth_dev, uint64_t req_features) +{ + struct virtio_hw *hw = eth_dev->data->dev_private; + struct virtio_net_config *config; + struct virtio_net_config local_config; + struct rte_pci_device *pci_dev = NULL; + int ret; + + /* Reset the device although not necessary at startup */ + vtpci_reset(hw); + + /* Tell the host we've noticed this device. */ + vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_ACK); + + /* Tell the host we've known how to drive the device. */ + vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER); + if (virtio_negotiate_features(hw, req_features) < 0) + return -1; + + if (!hw->virtio_user_dev) { + pci_dev = RTE_DEV_TO_PCI(eth_dev->device); + rte_eth_copy_pci_info(eth_dev, pci_dev); + } + + eth_dev->data->dev_flags = RTE_ETH_DEV_DETACHABLE; + /* If host does not support both status and MSI-X then disable LSC */ + if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS) && hw->use_msix) + eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC; + else + eth_dev->data->dev_flags &= ~RTE_ETH_DEV_INTR_LSC; + + rx_func_get(eth_dev); + + /* Setting up rx_header size for the device */ + if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF) || + vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) + hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr); + + /* Copy the permanent MAC address to: virtio_hw */ + virtio_get_hwaddr(hw); + ether_addr_copy((struct ether_addr *) hw->mac_addr, + ð_dev->data->mac_addrs[0]); + PMD_INIT_LOG(DEBUG, + "PORT MAC: %02X:%02X:%02X:%02X:%02X:%02X", + hw->mac_addr[0], hw->mac_addr[1], hw->mac_addr[2], + hw->mac_addr[3], hw->mac_addr[4], hw->mac_addr[5]); + + if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VQ)) { + config = &local_config; + + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, mac), + &config->mac, sizeof(config->mac)); + + if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS)) { + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, status), + &config->status, sizeof(config->status)); + } else { + PMD_INIT_LOG(DEBUG, + "VIRTIO_NET_F_STATUS is not supported"); + config->status = 0; + } + + if (vtpci_with_feature(hw, VIRTIO_NET_F_MQ)) { + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, max_virtqueue_pairs), + &config->max_virtqueue_pairs, + sizeof(config->max_virtqueue_pairs)); + } else { + PMD_INIT_LOG(DEBUG, + "VIRTIO_NET_F_MQ is not supported"); + config->max_virtqueue_pairs = 1; + } + + hw->max_queue_pairs = config->max_virtqueue_pairs; + + if (vtpci_with_feature(hw, VIRTIO_NET_F_MTU)) { + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, mtu), + &config->mtu, + sizeof(config->mtu)); + + /* + * MTU value has already been checked at negotiation + * time, but check again in case it has changed since + * then, which should not happen. + */ + if (config->mtu < ETHER_MIN_MTU) { + PMD_INIT_LOG(ERR, "invalid max MTU value (%u)", + config->mtu); + return -1; + } + + hw->max_mtu = config->mtu; + /* Set initial MTU to maximum one supported by vhost */ + eth_dev->data->mtu = config->mtu; + + } else { + hw->max_mtu = VIRTIO_MAX_RX_PKTLEN - ETHER_HDR_LEN - + VLAN_TAG_LEN - hw->vtnet_hdr_size; + } + + PMD_INIT_LOG(DEBUG, "config->max_virtqueue_pairs=%d", + config->max_virtqueue_pairs); + PMD_INIT_LOG(DEBUG, "config->status=%d", config->status); + PMD_INIT_LOG(DEBUG, + "PORT MAC: %02X:%02X:%02X:%02X:%02X:%02X", + config->mac[0], config->mac[1], + config->mac[2], config->mac[3], + config->mac[4], config->mac[5]); + } else { + PMD_INIT_LOG(DEBUG, "config->max_virtqueue_pairs=1"); + hw->max_queue_pairs = 1; + } + + ret = virtio_alloc_queues(eth_dev); + if (ret < 0) + return ret; + + if (eth_dev->data->dev_conf.intr_conf.rxq) { + if (virtio_configure_intr(eth_dev) < 0) { + PMD_INIT_LOG(ERR, "failed to configure interrupt"); + return -1; + } + } + + vtpci_reinit_complete(hw); + + if (pci_dev) + PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x", + eth_dev->data->port_id, pci_dev->id.vendor_id, + pci_dev->id.device_id); + + return 0; +} + +/* + * Remap the PCI device again (IO port map for legacy device and + * memory map for modern device), so that the secondary process + * could have the PCI initiated correctly. + */ +static int +virtio_remap_pci(struct rte_pci_device *pci_dev, struct virtio_hw *hw) +{ + if (hw->modern) { + /* + * We don't have to re-parse the PCI config space, since + * rte_pci_map_device() makes sure the mapped address + * in secondary process would equal to the one mapped in + * the primary process: error will be returned if that + * requirement is not met. + * + * That said, we could simply reuse all cap pointers + * (such as dev_cfg, common_cfg, etc.) parsed from the + * primary process, which is stored in shared memory. + */ + if (rte_pci_map_device(pci_dev)) { + PMD_INIT_LOG(DEBUG, "failed to map pci device!"); + return -1; + } + } else { + if (rte_pci_ioport_map(pci_dev, 0, VTPCI_IO(hw)) < 0) + return -1; + } + + return 0; +} + +static void +virtio_set_vtpci_ops(struct virtio_hw *hw) +{ +#ifdef RTE_VIRTIO_USER + if (hw->virtio_user_dev) + VTPCI_OPS(hw) = &virtio_user_ops; + else +#endif + if (hw->modern) + VTPCI_OPS(hw) = &modern_ops; + else + VTPCI_OPS(hw) = &legacy_ops; +} + +/* + * This function is based on probe() function in virtio_pci.c + * It returns 0 on success. + */ +int +eth_virtio_dev_init(struct rte_eth_dev *eth_dev) +{ + struct virtio_hw *hw = eth_dev->data->dev_private; + int ret; + + RTE_BUILD_BUG_ON(RTE_PKTMBUF_HEADROOM < sizeof(struct virtio_net_hdr_mrg_rxbuf)); + + eth_dev->dev_ops = &virtio_eth_dev_ops; + eth_dev->tx_pkt_burst = &virtio_xmit_pkts; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + if (!hw->virtio_user_dev) { + ret = virtio_remap_pci(RTE_DEV_TO_PCI(eth_dev->device), + hw); + if (ret) + return ret; + } + + virtio_set_vtpci_ops(hw); + if (hw->use_simple_rxtx) { + eth_dev->tx_pkt_burst = virtio_xmit_pkts_simple; + eth_dev->rx_pkt_burst = virtio_recv_pkts_vec; + } else { + rx_func_get(eth_dev); + } + return 0; + } + + /* Allocate memory for storing MAC addresses */ + eth_dev->data->mac_addrs = rte_zmalloc("virtio", VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN, 0); + if (eth_dev->data->mac_addrs == NULL) { + PMD_INIT_LOG(ERR, + "Failed to allocate %d bytes needed to store MAC addresses", + VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN); + return -ENOMEM; + } + + hw->port_id = eth_dev->data->port_id; + /* For virtio_user case the hw->virtio_user_dev is populated by + * virtio_user_eth_dev_alloc() before eth_virtio_dev_init() is called. + */ + if (!hw->virtio_user_dev) { + ret = vtpci_init(RTE_DEV_TO_PCI(eth_dev->device), hw); + if (ret) + return ret; + } + + /* reset device and negotiate default features */ + ret = virtio_init_device(eth_dev, VIRTIO_PMD_DEFAULT_GUEST_FEATURES); + if (ret < 0) + return ret; + + /* Setup interrupt callback */ + if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + rte_intr_callback_register(eth_dev->intr_handle, + virtio_interrupt_handler, eth_dev); + + return 0; +} + +static int +eth_virtio_dev_uninit(struct rte_eth_dev *eth_dev) +{ + PMD_INIT_FUNC_TRACE(); + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + return -EPERM; + + virtio_dev_stop(eth_dev); + virtio_dev_close(eth_dev); + + eth_dev->dev_ops = NULL; + eth_dev->tx_pkt_burst = NULL; + eth_dev->rx_pkt_burst = NULL; + + rte_free(eth_dev->data->mac_addrs); + eth_dev->data->mac_addrs = NULL; + + /* reset interrupt callback */ + if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + rte_intr_callback_unregister(eth_dev->intr_handle, + virtio_interrupt_handler, + eth_dev); + if (eth_dev->device) + rte_pci_unmap_device(RTE_DEV_TO_PCI(eth_dev->device)); + + PMD_INIT_LOG(DEBUG, "dev_uninit completed"); + + return 0; +} + +static int eth_virtio_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) +{ + return rte_eth_dev_pci_generic_probe(pci_dev, sizeof(struct virtio_hw), + eth_virtio_dev_init); +} + +static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev) +{ + return rte_eth_dev_pci_generic_remove(pci_dev, eth_virtio_dev_uninit); +} + +static struct rte_pci_driver rte_virtio_pmd = { + .driver = { + .name = "net_virtio", + }, + .id_table = pci_id_virtio_map, + .drv_flags = 0, + .probe = eth_virtio_pci_probe, + .remove = eth_virtio_pci_remove, +}; + +RTE_INIT(rte_virtio_pmd_init); +static void +rte_virtio_pmd_init(void) +{ + if (rte_eal_iopl_init() != 0) { + PMD_INIT_LOG(ERR, "IOPL call failed - cannot use virtio PMD"); + return; + } + + rte_pci_register(&rte_virtio_pmd); +} + +/* + * Configure virtio device + * It returns 0 on success. + */ +static int +virtio_dev_configure(struct rte_eth_dev *dev) +{ + const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode; + struct virtio_hw *hw = dev->data->dev_private; + uint64_t req_features; + int ret; + + PMD_INIT_LOG(DEBUG, "configure"); + req_features = VIRTIO_PMD_DEFAULT_GUEST_FEATURES; + if (rxmode->hw_ip_checksum) + req_features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM); + if (rxmode->enable_lro) + req_features |= + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | + (1ULL << VIRTIO_NET_F_GUEST_TSO6); + + /* if request features changed, reinit the device */ + if (req_features != hw->req_guest_features) { + ret = virtio_init_device(dev, req_features); + if (ret < 0) + return ret; + } + + if (rxmode->hw_ip_checksum && + !vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM)) { + PMD_DRV_LOG(NOTICE, + "rx ip checksum not available on this host"); + return -ENOTSUP; + } + + if (rxmode->enable_lro && + (!vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) || + !vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4))) { + PMD_DRV_LOG(NOTICE, + "lro not available on this host"); + return -ENOTSUP; + } + + /* start control queue */ + if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VQ)) + virtio_dev_cq_start(dev); + + hw->vlan_strip = rxmode->hw_vlan_strip; + + if (rxmode->hw_vlan_filter + && !vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VLAN)) { + PMD_DRV_LOG(NOTICE, + "vlan filtering not available on this host"); + return -ENOTSUP; + } + + if (dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + /* Enable vector (0) for Link State Intrerrupt */ + if (VTPCI_OPS(hw)->set_config_irq(hw, 0) == + VIRTIO_MSI_NO_VECTOR) { + PMD_DRV_LOG(ERR, "failed to set config vector"); + return -EBUSY; + } + + return 0; +} + + +static int +virtio_dev_start(struct rte_eth_dev *dev) +{ + uint16_t nb_queues, i; + struct virtnet_rx *rxvq; + struct virtnet_tx *txvq __rte_unused; + struct virtio_hw *hw = dev->data->dev_private; + + /* check if lsc interrupt feature is enabled */ + if (dev->data->dev_conf.intr_conf.lsc) { + if (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) { + PMD_DRV_LOG(ERR, "link status not supported by host"); + return -ENOTSUP; + } + } + + /* Enable uio/vfio intr/eventfd mapping: althrough we already did that + * in device configure, but it could be unmapped when device is + * stopped. + */ + if (dev->data->dev_conf.intr_conf.lsc || + dev->data->dev_conf.intr_conf.rxq) { + rte_intr_disable(dev->intr_handle); + + if (rte_intr_enable(dev->intr_handle) < 0) { + PMD_DRV_LOG(ERR, "interrupt enable failed"); + return -EIO; + } + } + + /*Notify the backend + *Otherwise the tap backend might already stop its queue due to fullness. + *vhost backend will have no chance to be waked up + */ + nb_queues = RTE_MAX(dev->data->nb_rx_queues, dev->data->nb_tx_queues); + if (hw->max_queue_pairs > 1) { + if (virtio_set_multiple_queues(dev, nb_queues) != 0) + return -EINVAL; + } + + PMD_INIT_LOG(DEBUG, "nb_queues=%d", nb_queues); + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + rxvq = dev->data->rx_queues[i]; + virtqueue_notify(rxvq->vq); + } + + PMD_INIT_LOG(DEBUG, "Notified backend at initialization"); + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + rxvq = dev->data->rx_queues[i]; + VIRTQUEUE_DUMP(rxvq->vq); + } + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + txvq = dev->data->tx_queues[i]; + VIRTQUEUE_DUMP(txvq->vq); + } + + hw->started = 1; + + /* Initialize Link state */ + virtio_dev_link_update(dev, 0); + + return 0; +} + +static void virtio_dev_free_mbufs(struct rte_eth_dev *dev) +{ + struct rte_mbuf *buf; + int i, mbuf_num = 0; + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + struct virtnet_rx *rxvq = dev->data->rx_queues[i]; + + PMD_INIT_LOG(DEBUG, + "Before freeing rxq[%d] used and unused buf", i); + VIRTQUEUE_DUMP(rxvq->vq); + + PMD_INIT_LOG(DEBUG, "rx_queues[%d]=%p", i, rxvq); + while ((buf = virtqueue_detatch_unused(rxvq->vq)) != NULL) { + rte_pktmbuf_free(buf); + mbuf_num++; + } + + PMD_INIT_LOG(DEBUG, "free %d mbufs", mbuf_num); + PMD_INIT_LOG(DEBUG, + "After freeing rxq[%d] used and unused buf", i); + VIRTQUEUE_DUMP(rxvq->vq); + } + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + struct virtnet_tx *txvq = dev->data->tx_queues[i]; + + PMD_INIT_LOG(DEBUG, + "Before freeing txq[%d] used and unused bufs", + i); + VIRTQUEUE_DUMP(txvq->vq); + + mbuf_num = 0; + while ((buf = virtqueue_detatch_unused(txvq->vq)) != NULL) { + rte_pktmbuf_free(buf); + mbuf_num++; + } + + PMD_INIT_LOG(DEBUG, "free %d mbufs", mbuf_num); + PMD_INIT_LOG(DEBUG, + "After freeing txq[%d] used and unused buf", i); + VIRTQUEUE_DUMP(txvq->vq); + } +} + +/* + * Stop device: disable interrupt and mark link down + */ +static void +virtio_dev_stop(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + struct rte_eth_link link; + struct rte_intr_conf *intr_conf = &dev->data->dev_conf.intr_conf; + + PMD_INIT_LOG(DEBUG, "stop"); + + if (intr_conf->lsc || intr_conf->rxq) + rte_intr_disable(dev->intr_handle); + + hw->started = 0; + memset(&link, 0, sizeof(link)); + virtio_dev_atomic_write_link_status(dev, &link); +} + +static int +virtio_dev_link_update(struct rte_eth_dev *dev, __rte_unused int wait_to_complete) +{ + struct rte_eth_link link, old; + uint16_t status; + struct virtio_hw *hw = dev->data->dev_private; + memset(&link, 0, sizeof(link)); + virtio_dev_atomic_read_link_status(dev, &link); + old = link; + link.link_duplex = ETH_LINK_FULL_DUPLEX; + link.link_speed = SPEED_10G; + + if (hw->started == 0) { + link.link_status = ETH_LINK_DOWN; + } else if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS)) { + PMD_INIT_LOG(DEBUG, "Get link status from hw"); + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, status), + &status, sizeof(status)); + if ((status & VIRTIO_NET_S_LINK_UP) == 0) { + link.link_status = ETH_LINK_DOWN; + PMD_INIT_LOG(DEBUG, "Port %d is down", + dev->data->port_id); + } else { + link.link_status = ETH_LINK_UP; + PMD_INIT_LOG(DEBUG, "Port %d is up", + dev->data->port_id); + } + } else { + link.link_status = ETH_LINK_UP; + } + virtio_dev_atomic_write_link_status(dev, &link); + + return (old.link_status == link.link_status) ? -1 : 0; +} + +static void +virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) +{ + uint64_t tso_mask, host_features; + struct virtio_hw *hw = dev->data->dev_private; + + dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */ + + dev_info->pci_dev = dev->device ? RTE_DEV_TO_PCI(dev->device) : NULL; + dev_info->max_rx_queues = + RTE_MIN(hw->max_queue_pairs, VIRTIO_MAX_RX_QUEUES); + dev_info->max_tx_queues = + RTE_MIN(hw->max_queue_pairs, VIRTIO_MAX_TX_QUEUES); + dev_info->min_rx_bufsize = VIRTIO_MIN_RX_BUFSIZE; + dev_info->max_rx_pktlen = VIRTIO_MAX_RX_PKTLEN; + dev_info->max_mac_addrs = VIRTIO_MAX_MAC_ADDRS; + dev_info->default_txconf = (struct rte_eth_txconf) { + .txq_flags = ETH_TXQ_FLAGS_NOOFFLOADS + }; + + host_features = VTPCI_OPS(hw)->get_features(hw); + dev_info->rx_offload_capa = 0; + if (host_features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) { + dev_info->rx_offload_capa |= + DEV_RX_OFFLOAD_TCP_CKSUM | + DEV_RX_OFFLOAD_UDP_CKSUM; + } + tso_mask = (1ULL << VIRTIO_NET_F_GUEST_TSO4) | + (1ULL << VIRTIO_NET_F_GUEST_TSO6); + if ((host_features & tso_mask) == tso_mask) + dev_info->rx_offload_capa |= DEV_RX_OFFLOAD_TCP_LRO; + + dev_info->tx_offload_capa = 0; + if (hw->guest_features & (1ULL << VIRTIO_NET_F_CSUM)) { + dev_info->tx_offload_capa |= + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM; + } + tso_mask = (1ULL << VIRTIO_NET_F_HOST_TSO4) | + (1ULL << VIRTIO_NET_F_HOST_TSO6); + if ((hw->guest_features & tso_mask) == tso_mask) + dev_info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO; +} + +/* + * It enables testpmd to collect per queue stats. + */ +static int +virtio_dev_queue_stats_mapping_set(__rte_unused struct rte_eth_dev *eth_dev, +__rte_unused uint16_t queue_id, __rte_unused uint8_t stat_idx, +__rte_unused uint8_t is_rx) +{ + return 0; +} + +RTE_PMD_EXPORT_NAME(net_virtio, __COUNTER__); +RTE_PMD_REGISTER_PCI_TABLE(net_virtio, pci_id_virtio_map); +RTE_PMD_REGISTER_KMOD_DEP(net_virtio, "* igb_uio | uio_pci_generic | vfio"); diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_ethdev.h b/src/seastar/dpdk/drivers/net/virtio/virtio_ethdev.h new file mode 100644 index 00000000..c3413c6d --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_ethdev.h @@ -0,0 +1,118 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTIO_ETHDEV_H_ +#define _VIRTIO_ETHDEV_H_ + +#include <stdint.h> + +#include "virtio_pci.h" + +#define SPEED_10 10 +#define SPEED_100 100 +#define SPEED_1000 1000 +#define SPEED_10G 10000 + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define VIRTIO_MAX_RX_QUEUES 128U +#define VIRTIO_MAX_TX_QUEUES 128U +#define VIRTIO_MAX_MAC_ADDRS 64 +#define VIRTIO_MIN_RX_BUFSIZE 64 +#define VIRTIO_MAX_RX_PKTLEN 9728U + +/* Features desired/implemented by this driver. */ +#define VIRTIO_PMD_DEFAULT_GUEST_FEATURES \ + (1u << VIRTIO_NET_F_MAC | \ + 1u << VIRTIO_NET_F_STATUS | \ + 1u << VIRTIO_NET_F_MQ | \ + 1u << VIRTIO_NET_F_CTRL_MAC_ADDR | \ + 1u << VIRTIO_NET_F_CTRL_VQ | \ + 1u << VIRTIO_NET_F_CTRL_RX | \ + 1u << VIRTIO_NET_F_CTRL_VLAN | \ + 1u << VIRTIO_NET_F_CSUM | \ + 1u << VIRTIO_NET_F_HOST_TSO4 | \ + 1u << VIRTIO_NET_F_HOST_TSO6 | \ + 1u << VIRTIO_NET_F_MRG_RXBUF | \ + 1u << VIRTIO_NET_F_MTU | \ + 1u << VIRTIO_RING_F_INDIRECT_DESC | \ + 1ULL << VIRTIO_F_VERSION_1 | \ + 1ULL << VIRTIO_F_IOMMU_PLATFORM) + +#define VIRTIO_PMD_SUPPORTED_GUEST_FEATURES \ + (VIRTIO_PMD_DEFAULT_GUEST_FEATURES | \ + 1u << VIRTIO_NET_F_GUEST_CSUM | \ + 1u << VIRTIO_NET_F_GUEST_TSO4 | \ + 1u << VIRTIO_NET_F_GUEST_TSO6) +/* + * CQ function prototype + */ +void virtio_dev_cq_start(struct rte_eth_dev *dev); + +/* + * RX/TX function prototypes + */ + +int virtio_dev_rx_queue_done(void *rxq, uint16_t offset); + +int virtio_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id, + uint16_t nb_rx_desc, unsigned int socket_id, + const struct rte_eth_rxconf *rx_conf, + struct rte_mempool *mb_pool); + +int virtio_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id, + uint16_t nb_tx_desc, unsigned int socket_id, + const struct rte_eth_txconf *tx_conf); + +uint16_t virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); + +uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); + +uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); + +uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); + +uint16_t virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); + +int eth_virtio_dev_init(struct rte_eth_dev *eth_dev); + +void virtio_interrupt_handler(void *param); + +#endif /* _VIRTIO_ETHDEV_H_ */ diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_logs.h b/src/seastar/dpdk/drivers/net/virtio/virtio_logs.h new file mode 100644 index 00000000..90a79eaa --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_logs.h @@ -0,0 +1,70 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTIO_LOGS_H_ +#define _VIRTIO_LOGS_H_ + +#include <rte_log.h> + +#ifdef RTE_LIBRTE_VIRTIO_DEBUG_INIT +#define PMD_INIT_LOG(level, fmt, args...) \ + RTE_LOG(level, PMD, "%s(): " fmt "\n", __func__, ## args) +#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>") +#else +#define PMD_INIT_LOG(level, fmt, args...) do { } while(0) +#define PMD_INIT_FUNC_TRACE() do { } while(0) +#endif + +#ifdef RTE_LIBRTE_VIRTIO_DEBUG_RX +#define PMD_RX_LOG(level, fmt, args...) \ + RTE_LOG(level, PMD, "%s() rx: " fmt "\n", __func__, ## args) +#else +#define PMD_RX_LOG(level, fmt, args...) do { } while(0) +#endif + +#ifdef RTE_LIBRTE_VIRTIO_DEBUG_TX +#define PMD_TX_LOG(level, fmt, args...) \ + RTE_LOG(level, PMD, "%s() tx: " fmt "\n", __func__, ## args) +#else +#define PMD_TX_LOG(level, fmt, args...) do { } while(0) +#endif + + +#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DRIVER +#define PMD_DRV_LOG(level, fmt, args...) \ + RTE_LOG(level, PMD, "%s(): " fmt "\n", __func__, ## args) +#else +#define PMD_DRV_LOG(level, fmt, args...) do { } while(0) +#endif + +#endif /* _VIRTIO_LOGS_H_ */ diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_pci.c b/src/seastar/dpdk/drivers/net/virtio/virtio_pci.c new file mode 100644 index 00000000..b7b3d615 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_pci.c @@ -0,0 +1,700 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdint.h> + +#ifdef RTE_EXEC_ENV_LINUXAPP + #include <dirent.h> + #include <fcntl.h> +#endif + +#include <rte_io.h> + +#include "virtio_pci.h" +#include "virtio_logs.h" +#include "virtqueue.h" + +/* + * Following macros are derived from linux/pci_regs.h, however, + * we can't simply include that header here, as there is no such + * file for non-Linux platform. + */ +#define PCI_CAPABILITY_LIST 0x34 +#define PCI_CAP_ID_VNDR 0x09 +#define PCI_CAP_ID_MSIX 0x11 + +/* + * The remaining space is defined by each driver as the per-driver + * configuration space. + */ +#define VIRTIO_PCI_CONFIG(hw) (((hw)->use_msix) ? 24 : 20) + +static inline int +check_vq_phys_addr_ok(struct virtqueue *vq) +{ + /* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit, + * and only accepts 32 bit page frame number. + * Check if the allocated physical memory exceeds 16TB. + */ + if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >> + (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) { + PMD_INIT_LOG(ERR, "vring address shouldn't be above 16TB!"); + return 0; + } + + return 1; +} + +/* + * Since we are in legacy mode: + * http://ozlabs.org/~rusty/virtio-spec/virtio-0.9.5.pdf + * + * "Note that this is possible because while the virtio header is PCI (i.e. + * little) endian, the device-specific region is encoded in the native endian of + * the guest (where such distinction is applicable)." + * + * For powerpc which supports both, qemu supposes that cpu is big endian and + * enforces this for the virtio-net stuff. + */ +static void +legacy_read_dev_config(struct virtio_hw *hw, size_t offset, + void *dst, int length) +{ +#ifdef RTE_ARCH_PPC_64 + int size; + + while (length > 0) { + if (length >= 4) { + size = 4; + rte_pci_ioport_read(VTPCI_IO(hw), dst, size, + VIRTIO_PCI_CONFIG(hw) + offset); + *(uint32_t *)dst = rte_be_to_cpu_32(*(uint32_t *)dst); + } else if (length >= 2) { + size = 2; + rte_pci_ioport_read(VTPCI_IO(hw), dst, size, + VIRTIO_PCI_CONFIG(hw) + offset); + *(uint16_t *)dst = rte_be_to_cpu_16(*(uint16_t *)dst); + } else { + size = 1; + rte_pci_ioport_read(VTPCI_IO(hw), dst, size, + VIRTIO_PCI_CONFIG(hw) + offset); + } + + dst = (char *)dst + size; + offset += size; + length -= size; + } +#else + rte_pci_ioport_read(VTPCI_IO(hw), dst, length, + VIRTIO_PCI_CONFIG(hw) + offset); +#endif +} + +static void +legacy_write_dev_config(struct virtio_hw *hw, size_t offset, + const void *src, int length) +{ +#ifdef RTE_ARCH_PPC_64 + union { + uint32_t u32; + uint16_t u16; + } tmp; + int size; + + while (length > 0) { + if (length >= 4) { + size = 4; + tmp.u32 = rte_cpu_to_be_32(*(const uint32_t *)src); + rte_pci_ioport_write(VTPCI_IO(hw), &tmp.u32, size, + VIRTIO_PCI_CONFIG(hw) + offset); + } else if (length >= 2) { + size = 2; + tmp.u16 = rte_cpu_to_be_16(*(const uint16_t *)src); + rte_pci_ioport_write(VTPCI_IO(hw), &tmp.u16, size, + VIRTIO_PCI_CONFIG(hw) + offset); + } else { + size = 1; + rte_pci_ioport_write(VTPCI_IO(hw), src, size, + VIRTIO_PCI_CONFIG(hw) + offset); + } + + src = (const char *)src + size; + offset += size; + length -= size; + } +#else + rte_pci_ioport_write(VTPCI_IO(hw), src, length, + VIRTIO_PCI_CONFIG(hw) + offset); +#endif +} + +static uint64_t +legacy_get_features(struct virtio_hw *hw) +{ + uint32_t dst; + + rte_pci_ioport_read(VTPCI_IO(hw), &dst, 4, VIRTIO_PCI_HOST_FEATURES); + return dst; +} + +static void +legacy_set_features(struct virtio_hw *hw, uint64_t features) +{ + if ((features >> 32) != 0) { + PMD_DRV_LOG(ERR, + "only 32 bit features are allowed for legacy virtio!"); + return; + } + rte_pci_ioport_write(VTPCI_IO(hw), &features, 4, + VIRTIO_PCI_GUEST_FEATURES); +} + +static uint8_t +legacy_get_status(struct virtio_hw *hw) +{ + uint8_t dst; + + rte_pci_ioport_read(VTPCI_IO(hw), &dst, 1, VIRTIO_PCI_STATUS); + return dst; +} + +static void +legacy_set_status(struct virtio_hw *hw, uint8_t status) +{ + rte_pci_ioport_write(VTPCI_IO(hw), &status, 1, VIRTIO_PCI_STATUS); +} + +static void +legacy_reset(struct virtio_hw *hw) +{ + legacy_set_status(hw, VIRTIO_CONFIG_STATUS_RESET); +} + +static uint8_t +legacy_get_isr(struct virtio_hw *hw) +{ + uint8_t dst; + + rte_pci_ioport_read(VTPCI_IO(hw), &dst, 1, VIRTIO_PCI_ISR); + return dst; +} + +/* Enable one vector (0) for Link State Intrerrupt */ +static uint16_t +legacy_set_config_irq(struct virtio_hw *hw, uint16_t vec) +{ + uint16_t dst; + + rte_pci_ioport_write(VTPCI_IO(hw), &vec, 2, VIRTIO_MSI_CONFIG_VECTOR); + rte_pci_ioport_read(VTPCI_IO(hw), &dst, 2, VIRTIO_MSI_CONFIG_VECTOR); + return dst; +} + +static uint16_t +legacy_set_queue_irq(struct virtio_hw *hw, struct virtqueue *vq, uint16_t vec) +{ + uint16_t dst; + + rte_pci_ioport_write(VTPCI_IO(hw), &vq->vq_queue_index, 2, + VIRTIO_PCI_QUEUE_SEL); + rte_pci_ioport_write(VTPCI_IO(hw), &vec, 2, VIRTIO_MSI_QUEUE_VECTOR); + rte_pci_ioport_read(VTPCI_IO(hw), &dst, 2, VIRTIO_MSI_QUEUE_VECTOR); + return dst; +} + +static uint16_t +legacy_get_queue_num(struct virtio_hw *hw, uint16_t queue_id) +{ + uint16_t dst; + + rte_pci_ioport_write(VTPCI_IO(hw), &queue_id, 2, VIRTIO_PCI_QUEUE_SEL); + rte_pci_ioport_read(VTPCI_IO(hw), &dst, 2, VIRTIO_PCI_QUEUE_NUM); + return dst; +} + +static int +legacy_setup_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + uint32_t src; + + if (!check_vq_phys_addr_ok(vq)) + return -1; + + rte_pci_ioport_write(VTPCI_IO(hw), &vq->vq_queue_index, 2, + VIRTIO_PCI_QUEUE_SEL); + src = vq->vq_ring_mem >> VIRTIO_PCI_QUEUE_ADDR_SHIFT; + rte_pci_ioport_write(VTPCI_IO(hw), &src, 4, VIRTIO_PCI_QUEUE_PFN); + + return 0; +} + +static void +legacy_del_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + uint32_t src = 0; + + rte_pci_ioport_write(VTPCI_IO(hw), &vq->vq_queue_index, 2, + VIRTIO_PCI_QUEUE_SEL); + rte_pci_ioport_write(VTPCI_IO(hw), &src, 4, VIRTIO_PCI_QUEUE_PFN); +} + +static void +legacy_notify_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + rte_pci_ioport_write(VTPCI_IO(hw), &vq->vq_queue_index, 2, + VIRTIO_PCI_QUEUE_NOTIFY); +} + +const struct virtio_pci_ops legacy_ops = { + .read_dev_cfg = legacy_read_dev_config, + .write_dev_cfg = legacy_write_dev_config, + .reset = legacy_reset, + .get_status = legacy_get_status, + .set_status = legacy_set_status, + .get_features = legacy_get_features, + .set_features = legacy_set_features, + .get_isr = legacy_get_isr, + .set_config_irq = legacy_set_config_irq, + .set_queue_irq = legacy_set_queue_irq, + .get_queue_num = legacy_get_queue_num, + .setup_queue = legacy_setup_queue, + .del_queue = legacy_del_queue, + .notify_queue = legacy_notify_queue, +}; + +static inline void +io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi) +{ + rte_write32(val & ((1ULL << 32) - 1), lo); + rte_write32(val >> 32, hi); +} + +static void +modern_read_dev_config(struct virtio_hw *hw, size_t offset, + void *dst, int length) +{ + int i; + uint8_t *p; + uint8_t old_gen, new_gen; + + do { + old_gen = rte_read8(&hw->common_cfg->config_generation); + + p = dst; + for (i = 0; i < length; i++) + *p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i); + + new_gen = rte_read8(&hw->common_cfg->config_generation); + } while (old_gen != new_gen); +} + +static void +modern_write_dev_config(struct virtio_hw *hw, size_t offset, + const void *src, int length) +{ + int i; + const uint8_t *p = src; + + for (i = 0; i < length; i++) + rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i)); +} + +static uint64_t +modern_get_features(struct virtio_hw *hw) +{ + uint32_t features_lo, features_hi; + + rte_write32(0, &hw->common_cfg->device_feature_select); + features_lo = rte_read32(&hw->common_cfg->device_feature); + + rte_write32(1, &hw->common_cfg->device_feature_select); + features_hi = rte_read32(&hw->common_cfg->device_feature); + + return ((uint64_t)features_hi << 32) | features_lo; +} + +static void +modern_set_features(struct virtio_hw *hw, uint64_t features) +{ + rte_write32(0, &hw->common_cfg->guest_feature_select); + rte_write32(features & ((1ULL << 32) - 1), + &hw->common_cfg->guest_feature); + + rte_write32(1, &hw->common_cfg->guest_feature_select); + rte_write32(features >> 32, + &hw->common_cfg->guest_feature); +} + +static uint8_t +modern_get_status(struct virtio_hw *hw) +{ + return rte_read8(&hw->common_cfg->device_status); +} + +static void +modern_set_status(struct virtio_hw *hw, uint8_t status) +{ + rte_write8(status, &hw->common_cfg->device_status); +} + +static void +modern_reset(struct virtio_hw *hw) +{ + modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET); + modern_get_status(hw); +} + +static uint8_t +modern_get_isr(struct virtio_hw *hw) +{ + return rte_read8(hw->isr); +} + +static uint16_t +modern_set_config_irq(struct virtio_hw *hw, uint16_t vec) +{ + rte_write16(vec, &hw->common_cfg->msix_config); + return rte_read16(&hw->common_cfg->msix_config); +} + +static uint16_t +modern_set_queue_irq(struct virtio_hw *hw, struct virtqueue *vq, uint16_t vec) +{ + rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select); + rte_write16(vec, &hw->common_cfg->queue_msix_vector); + return rte_read16(&hw->common_cfg->queue_msix_vector); +} + +static uint16_t +modern_get_queue_num(struct virtio_hw *hw, uint16_t queue_id) +{ + rte_write16(queue_id, &hw->common_cfg->queue_select); + return rte_read16(&hw->common_cfg->queue_size); +} + +static int +modern_setup_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + uint64_t desc_addr, avail_addr, used_addr; + uint16_t notify_off; + + if (!check_vq_phys_addr_ok(vq)) + return -1; + + desc_addr = vq->vq_ring_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail, + ring[vq->vq_nentries]), + VIRTIO_PCI_VRING_ALIGN); + + rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select); + + io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + notify_off = rte_read16(&hw->common_cfg->queue_notify_off); + vq->notify_addr = (void *)((uint8_t *)hw->notify_base + + notify_off * hw->notify_off_multiplier); + + rte_write16(1, &hw->common_cfg->queue_enable); + + PMD_INIT_LOG(DEBUG, "queue %u addresses:", vq->vq_queue_index); + PMD_INIT_LOG(DEBUG, "\t desc_addr: %" PRIx64, desc_addr); + PMD_INIT_LOG(DEBUG, "\t aval_addr: %" PRIx64, avail_addr); + PMD_INIT_LOG(DEBUG, "\t used_addr: %" PRIx64, used_addr); + PMD_INIT_LOG(DEBUG, "\t notify addr: %p (notify offset: %u)", + vq->notify_addr, notify_off); + + return 0; +} + +static void +modern_del_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select); + + io_write64_twopart(0, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(0, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(0, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + rte_write16(0, &hw->common_cfg->queue_enable); +} + +static void +modern_notify_queue(struct virtio_hw *hw __rte_unused, struct virtqueue *vq) +{ + rte_write16(vq->vq_queue_index, vq->notify_addr); +} + +const struct virtio_pci_ops modern_ops = { + .read_dev_cfg = modern_read_dev_config, + .write_dev_cfg = modern_write_dev_config, + .reset = modern_reset, + .get_status = modern_get_status, + .set_status = modern_set_status, + .get_features = modern_get_features, + .set_features = modern_set_features, + .get_isr = modern_get_isr, + .set_config_irq = modern_set_config_irq, + .set_queue_irq = modern_set_queue_irq, + .get_queue_num = modern_get_queue_num, + .setup_queue = modern_setup_queue, + .del_queue = modern_del_queue, + .notify_queue = modern_notify_queue, +}; + + +void +vtpci_read_dev_config(struct virtio_hw *hw, size_t offset, + void *dst, int length) +{ + VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length); +} + +void +vtpci_write_dev_config(struct virtio_hw *hw, size_t offset, + const void *src, int length) +{ + VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length); +} + +uint64_t +vtpci_negotiate_features(struct virtio_hw *hw, uint64_t host_features) +{ + uint64_t features; + + /* + * Limit negotiated features to what the driver, virtqueue, and + * host all support. + */ + features = host_features & hw->guest_features; + VTPCI_OPS(hw)->set_features(hw, features); + + return features; +} + +void +vtpci_reset(struct virtio_hw *hw) +{ + VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET); + /* flush status write */ + VTPCI_OPS(hw)->get_status(hw); +} + +void +vtpci_reinit_complete(struct virtio_hw *hw) +{ + vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK); +} + +void +vtpci_set_status(struct virtio_hw *hw, uint8_t status) +{ + if (status != VIRTIO_CONFIG_STATUS_RESET) + status |= VTPCI_OPS(hw)->get_status(hw); + + VTPCI_OPS(hw)->set_status(hw, status); +} + +uint8_t +vtpci_get_status(struct virtio_hw *hw) +{ + return VTPCI_OPS(hw)->get_status(hw); +} + +uint8_t +vtpci_isr(struct virtio_hw *hw) +{ + return VTPCI_OPS(hw)->get_isr(hw); +} + +static void * +get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap) +{ + uint8_t bar = cap->bar; + uint32_t length = cap->length; + uint32_t offset = cap->offset; + uint8_t *base; + + if (bar > 5) { + PMD_INIT_LOG(ERR, "invalid bar: %u", bar); + return NULL; + } + + if (offset + length < offset) { + PMD_INIT_LOG(ERR, "offset(%u) + length(%u) overflows", + offset, length); + return NULL; + } + + if (offset + length > dev->mem_resource[bar].len) { + PMD_INIT_LOG(ERR, + "invalid cap: overflows bar space: %u > %" PRIu64, + offset + length, dev->mem_resource[bar].len); + return NULL; + } + + base = dev->mem_resource[bar].addr; + if (base == NULL) { + PMD_INIT_LOG(ERR, "bar %u base addr is NULL", bar); + return NULL; + } + + return base + offset; +} + +static int +virtio_read_caps(struct rte_pci_device *dev, struct virtio_hw *hw) +{ + uint8_t pos; + struct virtio_pci_cap cap; + int ret; + + if (rte_pci_map_device(dev)) { + PMD_INIT_LOG(DEBUG, "failed to map pci device!"); + return -1; + } + + ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST); + if (ret < 0) { + PMD_INIT_LOG(DEBUG, "failed to read pci capability list"); + return -1; + } + + while (pos) { + ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos); + if (ret < 0) { + PMD_INIT_LOG(ERR, + "failed to read pci cap at pos: %x", pos); + break; + } + + if (cap.cap_vndr == PCI_CAP_ID_MSIX) + hw->use_msix = 1; + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) { + PMD_INIT_LOG(DEBUG, + "[%2x] skipping non VNDR cap id: %02x", + pos, cap.cap_vndr); + goto next; + } + + PMD_INIT_LOG(DEBUG, + "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u", + pos, cap.cfg_type, cap.bar, cap.offset, cap.length); + + switch (cap.cfg_type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + hw->common_cfg = get_cfg_addr(dev, &cap); + break; + case VIRTIO_PCI_CAP_NOTIFY_CFG: + rte_pci_read_config(dev, &hw->notify_off_multiplier, + 4, pos + sizeof(cap)); + hw->notify_base = get_cfg_addr(dev, &cap); + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + hw->dev_cfg = get_cfg_addr(dev, &cap); + break; + case VIRTIO_PCI_CAP_ISR_CFG: + hw->isr = get_cfg_addr(dev, &cap); + break; + } + +next: + pos = cap.cap_next; + } + + if (hw->common_cfg == NULL || hw->notify_base == NULL || + hw->dev_cfg == NULL || hw->isr == NULL) { + PMD_INIT_LOG(INFO, "no modern virtio pci device found."); + return -1; + } + + PMD_INIT_LOG(INFO, "found modern virtio pci device."); + + PMD_INIT_LOG(DEBUG, "common cfg mapped at: %p", hw->common_cfg); + PMD_INIT_LOG(DEBUG, "device cfg mapped at: %p", hw->dev_cfg); + PMD_INIT_LOG(DEBUG, "isr cfg mapped at: %p", hw->isr); + PMD_INIT_LOG(DEBUG, "notify base: %p, notify off multiplier: %u", + hw->notify_base, hw->notify_off_multiplier); + + return 0; +} + +/* + * Return -1: + * if there is error mapping with VFIO/UIO. + * if port map error when driver type is KDRV_NONE. + * if whitelisted but driver type is KDRV_UNKNOWN. + * Return 1 if kernel driver is managing the device. + * Return 0 on success. + */ +int +vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw) +{ + /* + * Try if we can succeed reading virtio pci caps, which exists + * only on modern pci device. If failed, we fallback to legacy + * virtio handling. + */ + if (virtio_read_caps(dev, hw) == 0) { + PMD_INIT_LOG(INFO, "modern virtio pci detected."); + virtio_hw_internal[hw->port_id].vtpci_ops = &modern_ops; + hw->modern = 1; + return 0; + } + + PMD_INIT_LOG(INFO, "trying with legacy virtio pci."); + if (rte_pci_ioport_map(dev, 0, VTPCI_IO(hw)) < 0) { + if (dev->kdrv == RTE_KDRV_UNKNOWN && + (!dev->device.devargs || + dev->device.devargs->type != + RTE_DEVTYPE_WHITELISTED_PCI)) { + PMD_INIT_LOG(INFO, + "skip kernel managed virtio device."); + return 1; + } + return -1; + } + + virtio_hw_internal[hw->port_id].vtpci_ops = &legacy_ops; + hw->modern = 0; + + return 0; +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_pci.h b/src/seastar/dpdk/drivers/net/virtio/virtio_pci.h new file mode 100644 index 00000000..18caebdd --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_pci.h @@ -0,0 +1,344 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTIO_PCI_H_ +#define _VIRTIO_PCI_H_ + +#include <stdint.h> + +#include <rte_pci.h> +#include <rte_ethdev.h> + +struct virtqueue; +struct virtnet_ctl; + +/* VirtIO PCI vendor/device ID. */ +#define VIRTIO_PCI_VENDORID 0x1AF4 +#define VIRTIO_PCI_LEGACY_DEVICEID_NET 0x1000 +#define VIRTIO_PCI_MODERN_DEVICEID_NET 0x1041 + +/* VirtIO ABI version, this must match exactly. */ +#define VIRTIO_PCI_ABI_VERSION 0 + +/* + * VirtIO Header, located in BAR 0. + */ +#define VIRTIO_PCI_HOST_FEATURES 0 /* host's supported features (32bit, RO)*/ +#define VIRTIO_PCI_GUEST_FEATURES 4 /* guest's supported features (32, RW) */ +#define VIRTIO_PCI_QUEUE_PFN 8 /* physical address of VQ (32, RW) */ +#define VIRTIO_PCI_QUEUE_NUM 12 /* number of ring entries (16, RO) */ +#define VIRTIO_PCI_QUEUE_SEL 14 /* current VQ selection (16, RW) */ +#define VIRTIO_PCI_QUEUE_NOTIFY 16 /* notify host regarding VQ (16, RW) */ +#define VIRTIO_PCI_STATUS 18 /* device status register (8, RW) */ +#define VIRTIO_PCI_ISR 19 /* interrupt status register, reading + * also clears the register (8, RO) */ +/* Only if MSIX is enabled: */ +#define VIRTIO_MSI_CONFIG_VECTOR 20 /* configuration change vector (16, RW) */ +#define VIRTIO_MSI_QUEUE_VECTOR 22 /* vector for selected VQ notifications + (16, RW) */ + +/* The bit of the ISR which indicates a device has an interrupt. */ +#define VIRTIO_PCI_ISR_INTR 0x1 +/* The bit of the ISR which indicates a device configuration change. */ +#define VIRTIO_PCI_ISR_CONFIG 0x2 +/* Vector value used to disable MSI for queue. */ +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* VirtIO device IDs. */ +#define VIRTIO_ID_NETWORK 0x01 +#define VIRTIO_ID_BLOCK 0x02 +#define VIRTIO_ID_CONSOLE 0x03 +#define VIRTIO_ID_ENTROPY 0x04 +#define VIRTIO_ID_BALLOON 0x05 +#define VIRTIO_ID_IOMEMORY 0x06 +#define VIRTIO_ID_9P 0x09 + +/* Status byte for guest to report progress. */ +#define VIRTIO_CONFIG_STATUS_RESET 0x00 +#define VIRTIO_CONFIG_STATUS_ACK 0x01 +#define VIRTIO_CONFIG_STATUS_DRIVER 0x02 +#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04 +#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08 +#define VIRTIO_CONFIG_STATUS_FAILED 0x80 + +/* + * Each virtqueue indirect descriptor list must be physically contiguous. + * To allow us to malloc(9) each list individually, limit the number + * supported to what will fit in one page. With 4KB pages, this is a limit + * of 256 descriptors. If there is ever a need for more, we can switch to + * contigmalloc(9) for the larger allocations, similar to what + * bus_dmamem_alloc(9) does. + * + * Note the sizeof(struct vring_desc) is 16 bytes. + */ +#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16)) + +/* The feature bitmap for virtio net */ +#define VIRTIO_NET_F_CSUM 0 /* Host handles pkts w/ partial csum */ +#define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/ partial csum */ +#define VIRTIO_NET_F_MTU 3 /* Initial MTU advice. */ +#define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */ +#define VIRTIO_NET_F_GUEST_TSO4 7 /* Guest can handle TSOv4 in. */ +#define VIRTIO_NET_F_GUEST_TSO6 8 /* Guest can handle TSOv6 in. */ +#define VIRTIO_NET_F_GUEST_ECN 9 /* Guest can handle TSO[6] w/ ECN in. */ +#define VIRTIO_NET_F_GUEST_UFO 10 /* Guest can handle UFO in. */ +#define VIRTIO_NET_F_HOST_TSO4 11 /* Host can handle TSOv4 in. */ +#define VIRTIO_NET_F_HOST_TSO6 12 /* Host can handle TSOv6 in. */ +#define VIRTIO_NET_F_HOST_ECN 13 /* Host can handle TSO[6] w/ ECN in. */ +#define VIRTIO_NET_F_HOST_UFO 14 /* Host can handle UFO in. */ +#define VIRTIO_NET_F_MRG_RXBUF 15 /* Host can merge receive buffers. */ +#define VIRTIO_NET_F_STATUS 16 /* virtio_net_config.status available */ +#define VIRTIO_NET_F_CTRL_VQ 17 /* Control channel available */ +#define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ +#define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE 21 /* Guest can announce device on the + * network */ +#define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow + * Steering */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ + +/* Do we get callbacks when the ring is completely used, even if we've + * suppressed them? */ +#define VIRTIO_F_NOTIFY_ON_EMPTY 24 + +/* Can the device handle any descriptor layout? */ +#define VIRTIO_F_ANY_LAYOUT 27 + +/* We support indirect buffer descriptors */ +#define VIRTIO_RING_F_INDIRECT_DESC 28 + +#define VIRTIO_F_VERSION_1 32 +#define VIRTIO_F_IOMMU_PLATFORM 33 + +/* + * Some VirtIO feature bits (currently bits 28 through 31) are + * reserved for the transport being used (eg. virtio_ring), the + * rest are per-device feature bits. + */ +#define VIRTIO_TRANSPORT_F_START 28 +#define VIRTIO_TRANSPORT_F_END 34 + +/* The Guest publishes the used index for which it expects an interrupt + * at the end of the avail ring. Host should ignore the avail->flags field. */ +/* The Host publishes the avail index for which it expects a kick + * at the end of the used ring. Guest should ignore the used->flags field. */ +#define VIRTIO_RING_F_EVENT_IDX 29 + +#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ +#define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ + +/* + * Maximum number of virtqueues per device. + */ +#define VIRTIO_MAX_VIRTQUEUE_PAIRS 8 +#define VIRTIO_MAX_VIRTQUEUES (VIRTIO_MAX_VIRTQUEUE_PAIRS * 2 + 1) + +/* Common configuration */ +#define VIRTIO_PCI_CAP_COMMON_CFG 1 +/* Notifications */ +#define VIRTIO_PCI_CAP_NOTIFY_CFG 2 +/* ISR Status */ +#define VIRTIO_PCI_CAP_ISR_CFG 3 +/* Device specific configuration */ +#define VIRTIO_PCI_CAP_DEVICE_CFG 4 +/* PCI configuration access */ +#define VIRTIO_PCI_CAP_PCI_CFG 5 + +/* This is the PCI capability header: */ +struct virtio_pci_cap { + uint8_t cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */ + uint8_t cap_next; /* Generic PCI field: next ptr. */ + uint8_t cap_len; /* Generic PCI field: capability length */ + uint8_t cfg_type; /* Identifies the structure. */ + uint8_t bar; /* Where to find it. */ + uint8_t padding[3]; /* Pad to full dword. */ + uint32_t offset; /* Offset within bar. */ + uint32_t length; /* Length of the structure, in bytes. */ +}; + +struct virtio_pci_notify_cap { + struct virtio_pci_cap cap; + uint32_t notify_off_multiplier; /* Multiplier for queue_notify_off. */ +}; + +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */ +struct virtio_pci_common_cfg { + /* About the whole device. */ + uint32_t device_feature_select; /* read-write */ + uint32_t device_feature; /* read-only */ + uint32_t guest_feature_select; /* read-write */ + uint32_t guest_feature; /* read-write */ + uint16_t msix_config; /* read-write */ + uint16_t num_queues; /* read-only */ + uint8_t device_status; /* read-write */ + uint8_t config_generation; /* read-only */ + + /* About a specific virtqueue. */ + uint16_t queue_select; /* read-write */ + uint16_t queue_size; /* read-write, power of 2. */ + uint16_t queue_msix_vector; /* read-write */ + uint16_t queue_enable; /* read-write */ + uint16_t queue_notify_off; /* read-only */ + uint32_t queue_desc_lo; /* read-write */ + uint32_t queue_desc_hi; /* read-write */ + uint32_t queue_avail_lo; /* read-write */ + uint32_t queue_avail_hi; /* read-write */ + uint32_t queue_used_lo; /* read-write */ + uint32_t queue_used_hi; /* read-write */ +}; + +struct virtio_hw; + +struct virtio_pci_ops { + void (*read_dev_cfg)(struct virtio_hw *hw, size_t offset, + void *dst, int len); + void (*write_dev_cfg)(struct virtio_hw *hw, size_t offset, + const void *src, int len); + void (*reset)(struct virtio_hw *hw); + + uint8_t (*get_status)(struct virtio_hw *hw); + void (*set_status)(struct virtio_hw *hw, uint8_t status); + + uint64_t (*get_features)(struct virtio_hw *hw); + void (*set_features)(struct virtio_hw *hw, uint64_t features); + + uint8_t (*get_isr)(struct virtio_hw *hw); + + uint16_t (*set_config_irq)(struct virtio_hw *hw, uint16_t vec); + + uint16_t (*set_queue_irq)(struct virtio_hw *hw, struct virtqueue *vq, + uint16_t vec); + + uint16_t (*get_queue_num)(struct virtio_hw *hw, uint16_t queue_id); + int (*setup_queue)(struct virtio_hw *hw, struct virtqueue *vq); + void (*del_queue)(struct virtio_hw *hw, struct virtqueue *vq); + void (*notify_queue)(struct virtio_hw *hw, struct virtqueue *vq); +}; + +struct virtio_net_config; + +struct virtio_hw { + struct virtnet_ctl *cvq; + uint64_t req_guest_features; + uint64_t guest_features; + uint32_t max_queue_pairs; + uint16_t started; + uint16_t max_mtu; + uint16_t vtnet_hdr_size; + uint8_t vlan_strip; + uint8_t use_msix; + uint8_t modern; + uint8_t use_simple_rxtx; + uint8_t port_id; + uint8_t mac_addr[ETHER_ADDR_LEN]; + uint32_t notify_off_multiplier; + uint8_t *isr; + uint16_t *notify_base; + struct virtio_pci_common_cfg *common_cfg; + struct virtio_net_config *dev_cfg; + void *virtio_user_dev; + + struct virtqueue **vqs; +}; + + +/* + * While virtio_hw is stored in shared memory, this structure stores + * some infos that may vary in the multiple process model locally. + * For example, the vtpci_ops pointer. + */ +struct virtio_hw_internal { + const struct virtio_pci_ops *vtpci_ops; + struct rte_pci_ioport io; +}; + +#define VTPCI_OPS(hw) (virtio_hw_internal[(hw)->port_id].vtpci_ops) +#define VTPCI_IO(hw) (&virtio_hw_internal[(hw)->port_id].io) + +extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_ETHPORTS]; + + +/* + * This structure is just a reference to read + * net device specific config space; it just a chodu structure + * + */ +struct virtio_net_config { + /* The config defining mac address (if VIRTIO_NET_F_MAC) */ + uint8_t mac[ETHER_ADDR_LEN]; + /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ + uint16_t status; + uint16_t max_virtqueue_pairs; + uint16_t mtu; +} __attribute__((packed)); + +/* + * How many bits to shift physical queue address written to QUEUE_PFN. + * 12 is historical, and due to x86 page size. + */ +#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12 + +/* The alignment to use between consumer and producer parts of vring. */ +#define VIRTIO_PCI_VRING_ALIGN 4096 + +static inline int +vtpci_with_feature(struct virtio_hw *hw, uint64_t bit) +{ + return (hw->guest_features & (1ULL << bit)) != 0; +} + +/* + * Function declaration from virtio_pci.c + */ +int vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw); +void vtpci_reset(struct virtio_hw *); + +void vtpci_reinit_complete(struct virtio_hw *); + +uint8_t vtpci_get_status(struct virtio_hw *); +void vtpci_set_status(struct virtio_hw *, uint8_t); + +uint64_t vtpci_negotiate_features(struct virtio_hw *, uint64_t); + +void vtpci_write_dev_config(struct virtio_hw *, size_t, const void *, int); + +void vtpci_read_dev_config(struct virtio_hw *, size_t, void *, int); + +uint8_t vtpci_isr(struct virtio_hw *); + +extern const struct virtio_pci_ops legacy_ops; +extern const struct virtio_pci_ops modern_ops; +extern const struct virtio_pci_ops virtio_user_ops; + +#endif /* _VIRTIO_PCI_H_ */ diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_ring.h b/src/seastar/dpdk/drivers/net/virtio/virtio_ring.h new file mode 100644 index 00000000..fcecc161 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_ring.h @@ -0,0 +1,163 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTIO_RING_H_ +#define _VIRTIO_RING_H_ + +#include <stdint.h> + +#include <rte_common.h> + +/* This marks a buffer as continuing via the next field. */ +#define VRING_DESC_F_NEXT 1 +/* This marks a buffer as write-only (otherwise read-only). */ +#define VRING_DESC_F_WRITE 2 +/* This means the buffer contains a list of buffer descriptors. */ +#define VRING_DESC_F_INDIRECT 4 + +/* The Host uses this in used->flags to advise the Guest: don't kick me + * when you add a buffer. It's unreliable, so it's simply an + * optimization. Guest will still kick if it's out of buffers. */ +#define VRING_USED_F_NO_NOTIFY 1 +/* The Guest uses this in avail->flags to advise the Host: don't + * interrupt me when you consume a buffer. It's unreliable, so it's + * simply an optimization. */ +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +/* VirtIO ring descriptors: 16 bytes. + * These can chain together via "next". */ +struct vring_desc { + uint64_t addr; /* Address (guest-physical). */ + uint32_t len; /* Length. */ + uint16_t flags; /* The flags as indicated above. */ + uint16_t next; /* We chain unused descriptors via this. */ +}; + +struct vring_avail { + uint16_t flags; + uint16_t idx; + uint16_t ring[0]; +}; + +/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */ +struct vring_used_elem { + /* Index of start of used descriptor chain. */ + uint32_t id; + /* Total length of the descriptor chain which was written to. */ + uint32_t len; +}; + +struct vring_used { + uint16_t flags; + volatile uint16_t idx; + struct vring_used_elem ring[0]; +}; + +struct vring { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; +}; + +/* The standard layout for the ring is a continuous chunk of memory which + * looks like this. We assume num is a power of 2. + * + * struct vring { + * // The actual descriptors (16 bytes each) + * struct vring_desc desc[num]; + * + * // A ring of available descriptor heads with free-running index. + * __u16 avail_flags; + * __u16 avail_idx; + * __u16 available[num]; + * __u16 used_event_idx; + * + * // Padding to the next align boundary. + * char pad[]; + * + * // A ring of used descriptor heads with free-running index. + * __u16 used_flags; + * __u16 used_idx; + * struct vring_used_elem used[num]; + * __u16 avail_event_idx; + * }; + * + * NOTE: for VirtIO PCI, align is 4096. + */ + +/* + * We publish the used event index at the end of the available ring, and vice + * versa. They are at the end for backwards compatibility. + */ +#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num]) +#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num]) + +static inline size_t +vring_size(unsigned int num, unsigned long align) +{ + size_t size; + + size = num * sizeof(struct vring_desc); + size += sizeof(struct vring_avail) + (num * sizeof(uint16_t)); + size = RTE_ALIGN_CEIL(size, align); + size += sizeof(struct vring_used) + + (num * sizeof(struct vring_used_elem)); + return size; +} + +static inline void +vring_init(struct vring *vr, unsigned int num, uint8_t *p, + unsigned long align) +{ + vr->num = num; + vr->desc = (struct vring_desc *) p; + vr->avail = (struct vring_avail *) (p + + num * sizeof(struct vring_desc)); + vr->used = (void *) + RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align); +} + +/* + * The following is used with VIRTIO_RING_F_EVENT_IDX. + * Assuming a given event_idx value from the other size, if we have + * just incremented index from old to new_idx, should we trigger an + * event? + */ +static inline int +vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) +{ + return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old); +} + +#endif /* _VIRTIO_RING_H_ */ diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx.c b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx.c new file mode 100644 index 00000000..fbc96dfb --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx.c @@ -0,0 +1,1097 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <rte_cycles.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_branch_prediction.h> +#include <rte_mempool.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_prefetch.h> +#include <rte_string_fns.h> +#include <rte_errno.h> +#include <rte_byteorder.h> +#include <rte_cpuflags.h> +#include <rte_net.h> +#include <rte_ip.h> +#include <rte_udp.h> +#include <rte_tcp.h> + +#include "virtio_logs.h" +#include "virtio_ethdev.h" +#include "virtio_pci.h" +#include "virtqueue.h" +#include "virtio_rxtx.h" + +#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP +#define VIRTIO_DUMP_PACKET(m, len) rte_pktmbuf_dump(stdout, m, len) +#else +#define VIRTIO_DUMP_PACKET(m, len) do { } while (0) +#endif + + +#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \ + ETH_TXQ_FLAGS_NOOFFLOADS) + +int +virtio_dev_rx_queue_done(void *rxq, uint16_t offset) +{ + struct virtnet_rx *rxvq = rxq; + struct virtqueue *vq = rxvq->vq; + + return VIRTQUEUE_NUSED(vq) >= offset; +} + +static void +vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx) +{ + struct vring_desc *dp, *dp_tail; + struct vq_desc_extra *dxp; + uint16_t desc_idx_last = desc_idx; + + dp = &vq->vq_ring.desc[desc_idx]; + dxp = &vq->vq_descx[desc_idx]; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs); + if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) { + while (dp->flags & VRING_DESC_F_NEXT) { + desc_idx_last = dp->next; + dp = &vq->vq_ring.desc[dp->next]; + } + } + dxp->ndescs = 0; + + /* + * We must append the existing free chain, if any, to the end of + * newly freed chain. If the virtqueue was completely used, then + * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above). + */ + if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) { + vq->vq_desc_head_idx = desc_idx; + } else { + dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx]; + dp_tail->next = desc_idx; + } + + vq->vq_desc_tail_idx = desc_idx_last; + dp->next = VQ_RING_DESC_CHAIN_END; +} + +static uint16_t +virtqueue_dequeue_burst_rx(struct virtqueue *vq, struct rte_mbuf **rx_pkts, + uint32_t *len, uint16_t num) +{ + struct vring_used_elem *uep; + struct rte_mbuf *cookie; + uint16_t used_idx, desc_idx; + uint16_t i; + + /* Caller does the check */ + for (i = 0; i < num ; i++) { + used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + uep = &vq->vq_ring.used->ring[used_idx]; + desc_idx = (uint16_t) uep->id; + len[i] = uep->len; + cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie; + + if (unlikely(cookie == NULL)) { + PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u", + vq->vq_used_cons_idx); + break; + } + + rte_prefetch0(cookie); + rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *)); + rx_pkts[i] = cookie; + vq->vq_used_cons_idx++; + vq_ring_free_chain(vq, desc_idx); + vq->vq_descx[desc_idx].cookie = NULL; + } + + return i; +} + +#ifndef DEFAULT_TX_FREE_THRESH +#define DEFAULT_TX_FREE_THRESH 32 +#endif + +/* Cleanup from completed transmits. */ +static void +virtio_xmit_cleanup(struct virtqueue *vq, uint16_t num) +{ + uint16_t i, used_idx, desc_idx; + for (i = 0; i < num; i++) { + struct vring_used_elem *uep; + struct vq_desc_extra *dxp; + + used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + uep = &vq->vq_ring.used->ring[used_idx]; + + desc_idx = (uint16_t) uep->id; + dxp = &vq->vq_descx[desc_idx]; + vq->vq_used_cons_idx++; + vq_ring_free_chain(vq, desc_idx); + + if (dxp->cookie != NULL) { + rte_pktmbuf_free(dxp->cookie); + dxp->cookie = NULL; + } + } +} + + +static inline int +virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie) +{ + struct vq_desc_extra *dxp; + struct virtio_hw *hw = vq->hw; + struct vring_desc *start_dp; + uint16_t needed = 1; + uint16_t head_idx, idx; + + if (unlikely(vq->vq_free_cnt == 0)) + return -ENOSPC; + if (unlikely(vq->vq_free_cnt < needed)) + return -EMSGSIZE; + + head_idx = vq->vq_desc_head_idx; + if (unlikely(head_idx >= vq->vq_nentries)) + return -EFAULT; + + idx = head_idx; + dxp = &vq->vq_descx[idx]; + dxp->cookie = (void *)cookie; + dxp->ndescs = needed; + + start_dp = vq->vq_ring.desc; + start_dp[idx].addr = + VIRTIO_MBUF_ADDR(cookie, vq) + + RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size; + start_dp[idx].len = + cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size; + start_dp[idx].flags = VRING_DESC_F_WRITE; + idx = start_dp[idx].next; + vq->vq_desc_head_idx = idx; + if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) + vq->vq_desc_tail_idx = idx; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed); + vq_update_avail_ring(vq, head_idx); + + return 0; +} + +/* When doing TSO, the IP length is not included in the pseudo header + * checksum of the packet given to the PMD, but for virtio it is + * expected. + */ +static void +virtio_tso_fix_cksum(struct rte_mbuf *m) +{ + /* common case: header is not fragmented */ + if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len + + m->l4_len)) { + struct ipv4_hdr *iph; + struct ipv6_hdr *ip6h; + struct tcp_hdr *th; + uint16_t prev_cksum, new_cksum, ip_len, ip_paylen; + uint32_t tmp; + + iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len); + th = RTE_PTR_ADD(iph, m->l3_len); + if ((iph->version_ihl >> 4) == 4) { + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + ip_len = iph->total_length; + ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) - + m->l3_len); + } else { + ip6h = (struct ipv6_hdr *)iph; + ip_paylen = ip6h->payload_len; + } + + /* calculate the new phdr checksum not including ip_paylen */ + prev_cksum = th->cksum; + tmp = prev_cksum; + tmp += ip_paylen; + tmp = (tmp & 0xffff) + (tmp >> 16); + new_cksum = tmp; + + /* replace it in the packet */ + th->cksum = new_cksum; + } +} + +static inline int +tx_offload_enabled(struct virtio_hw *hw) +{ + return vtpci_with_feature(hw, VIRTIO_NET_F_CSUM) || + vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO4) || + vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO6); +} + +/* avoid write operation when necessary, to lessen cache issues */ +#define ASSIGN_UNLESS_EQUAL(var, val) do { \ + if ((var) != (val)) \ + (var) = (val); \ +} while (0) + +static inline void +virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie, + uint16_t needed, int use_indirect, int can_push) +{ + struct virtio_tx_region *txr = txvq->virtio_net_hdr_mz->addr; + struct vq_desc_extra *dxp; + struct virtqueue *vq = txvq->vq; + struct vring_desc *start_dp; + uint16_t seg_num = cookie->nb_segs; + uint16_t head_idx, idx; + uint16_t head_size = vq->hw->vtnet_hdr_size; + struct virtio_net_hdr *hdr; + int offload; + + offload = tx_offload_enabled(vq->hw); + head_idx = vq->vq_desc_head_idx; + idx = head_idx; + dxp = &vq->vq_descx[idx]; + dxp->cookie = (void *)cookie; + dxp->ndescs = needed; + + start_dp = vq->vq_ring.desc; + + if (can_push) { + /* prepend cannot fail, checked by caller */ + hdr = (struct virtio_net_hdr *) + rte_pktmbuf_prepend(cookie, head_size); + /* if offload disabled, it is not zeroed below, do it now */ + if (offload == 0) { + ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0); + ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0); + ASSIGN_UNLESS_EQUAL(hdr->flags, 0); + ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0); + ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0); + ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0); + } + } else if (use_indirect) { + /* setup tx ring slot to point to indirect + * descriptor list stored in reserved region. + * + * the first slot in indirect ring is already preset + * to point to the header in reserved region + */ + start_dp[idx].addr = txvq->virtio_net_hdr_mem + + RTE_PTR_DIFF(&txr[idx].tx_indir, txr); + start_dp[idx].len = (seg_num + 1) * sizeof(struct vring_desc); + start_dp[idx].flags = VRING_DESC_F_INDIRECT; + hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr; + + /* loop below will fill in rest of the indirect elements */ + start_dp = txr[idx].tx_indir; + idx = 1; + } else { + /* setup first tx ring slot to point to header + * stored in reserved region. + */ + start_dp[idx].addr = txvq->virtio_net_hdr_mem + + RTE_PTR_DIFF(&txr[idx].tx_hdr, txr); + start_dp[idx].len = vq->hw->vtnet_hdr_size; + start_dp[idx].flags = VRING_DESC_F_NEXT; + hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr; + + idx = start_dp[idx].next; + } + + /* Checksum Offload / TSO */ + if (offload) { + if (cookie->ol_flags & PKT_TX_TCP_SEG) + cookie->ol_flags |= PKT_TX_TCP_CKSUM; + + switch (cookie->ol_flags & PKT_TX_L4_MASK) { + case PKT_TX_UDP_CKSUM: + hdr->csum_start = cookie->l2_len + cookie->l3_len; + hdr->csum_offset = offsetof(struct udp_hdr, + dgram_cksum); + hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + break; + + case PKT_TX_TCP_CKSUM: + hdr->csum_start = cookie->l2_len + cookie->l3_len; + hdr->csum_offset = offsetof(struct tcp_hdr, cksum); + hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + break; + + default: + ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0); + ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0); + ASSIGN_UNLESS_EQUAL(hdr->flags, 0); + break; + } + + /* TCP Segmentation Offload */ + if (cookie->ol_flags & PKT_TX_TCP_SEG) { + virtio_tso_fix_cksum(cookie); + hdr->gso_type = (cookie->ol_flags & PKT_TX_IPV6) ? + VIRTIO_NET_HDR_GSO_TCPV6 : + VIRTIO_NET_HDR_GSO_TCPV4; + hdr->gso_size = cookie->tso_segsz; + hdr->hdr_len = + cookie->l2_len + + cookie->l3_len + + cookie->l4_len; + } else { + ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0); + ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0); + ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0); + } + } + + do { + start_dp[idx].addr = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq); + start_dp[idx].len = cookie->data_len; + start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0; + idx = start_dp[idx].next; + } while ((cookie = cookie->next) != NULL); + + if (use_indirect) + idx = vq->vq_ring.desc[head_idx].next; + + vq->vq_desc_head_idx = idx; + if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) + vq->vq_desc_tail_idx = idx; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed); + vq_update_avail_ring(vq, head_idx); +} + +void +virtio_dev_cq_start(struct rte_eth_dev *dev) +{ + struct virtio_hw *hw = dev->data->dev_private; + + if (hw->cvq && hw->cvq->vq) { + VIRTQUEUE_DUMP((struct virtqueue *)hw->cvq->vq); + } +} + +int +virtio_dev_rx_queue_setup(struct rte_eth_dev *dev, + uint16_t queue_idx, + uint16_t nb_desc, + unsigned int socket_id __rte_unused, + __rte_unused const struct rte_eth_rxconf *rx_conf, + struct rte_mempool *mp) +{ + uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX; + struct virtio_hw *hw = dev->data->dev_private; + struct virtqueue *vq = hw->vqs[vtpci_queue_idx]; + struct virtnet_rx *rxvq; + int error, nbufs; + struct rte_mbuf *m; + uint16_t desc_idx; + + PMD_INIT_FUNC_TRACE(); + + if (nb_desc == 0 || nb_desc > vq->vq_nentries) + nb_desc = vq->vq_nentries; + vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc); + + rxvq = &vq->rxq; + rxvq->queue_id = queue_idx; + rxvq->mpool = mp; + if (rxvq->mpool == NULL) { + rte_exit(EXIT_FAILURE, + "Cannot allocate mbufs for rx virtqueue"); + } + dev->data->rx_queues[queue_idx] = rxvq; + + + /* Allocate blank mbufs for the each rx descriptor */ + nbufs = 0; + error = ENOSPC; + + if (hw->use_simple_rxtx) { + for (desc_idx = 0; desc_idx < vq->vq_nentries; + desc_idx++) { + vq->vq_ring.avail->ring[desc_idx] = desc_idx; + vq->vq_ring.desc[desc_idx].flags = + VRING_DESC_F_WRITE; + } + } + + memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf)); + for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST; + desc_idx++) { + vq->sw_ring[vq->vq_nentries + desc_idx] = + &rxvq->fake_mbuf; + } + + while (!virtqueue_full(vq)) { + m = rte_mbuf_raw_alloc(rxvq->mpool); + if (m == NULL) + break; + + /* Enqueue allocated buffers */ + if (hw->use_simple_rxtx) + error = virtqueue_enqueue_recv_refill_simple(vq, m); + else + error = virtqueue_enqueue_recv_refill(vq, m); + + if (error) { + rte_pktmbuf_free(m); + break; + } + nbufs++; + } + + vq_update_avail_idx(vq); + + PMD_INIT_LOG(DEBUG, "Allocated %d bufs", nbufs); + + virtio_rxq_vec_setup(rxvq); + + VIRTQUEUE_DUMP(vq); + + return 0; +} + +static void +virtio_update_rxtx_handler(struct rte_eth_dev *dev, + const struct rte_eth_txconf *tx_conf) +{ + uint8_t use_simple_rxtx = 0; + struct virtio_hw *hw = dev->data->dev_private; + +#if defined RTE_ARCH_X86 + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3)) + use_simple_rxtx = 1; +#elif defined RTE_ARCH_ARM64 || defined CONFIG_RTE_ARCH_ARM + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) + use_simple_rxtx = 1; +#endif + /* Use simple rx/tx func if single segment and no offloads */ + if (use_simple_rxtx && + (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS && + !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) { + PMD_INIT_LOG(INFO, "Using simple rx/tx path"); + dev->tx_pkt_burst = virtio_xmit_pkts_simple; + dev->rx_pkt_burst = virtio_recv_pkts_vec; + hw->use_simple_rxtx = use_simple_rxtx; + } +} + +/* + * struct rte_eth_dev *dev: Used to update dev + * uint16_t nb_desc: Defaults to values read from config space + * unsigned int socket_id: Used to allocate memzone + * const struct rte_eth_txconf *tx_conf: Used to setup tx engine + * uint16_t queue_idx: Just used as an index in dev txq list + */ +int +virtio_dev_tx_queue_setup(struct rte_eth_dev *dev, + uint16_t queue_idx, + uint16_t nb_desc, + unsigned int socket_id __rte_unused, + const struct rte_eth_txconf *tx_conf) +{ + uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX; + struct virtio_hw *hw = dev->data->dev_private; + struct virtqueue *vq = hw->vqs[vtpci_queue_idx]; + struct virtnet_tx *txvq; + uint16_t tx_free_thresh; + uint16_t desc_idx; + + PMD_INIT_FUNC_TRACE(); + + virtio_update_rxtx_handler(dev, tx_conf); + + if (nb_desc == 0 || nb_desc > vq->vq_nentries) + nb_desc = vq->vq_nentries; + vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc); + + txvq = &vq->txq; + txvq->queue_id = queue_idx; + + tx_free_thresh = tx_conf->tx_free_thresh; + if (tx_free_thresh == 0) + tx_free_thresh = + RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH); + + if (tx_free_thresh >= (vq->vq_nentries - 3)) { + RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the " + "number of TX entries minus 3 (%u)." + " (tx_free_thresh=%u port=%u queue=%u)\n", + vq->vq_nentries - 3, + tx_free_thresh, dev->data->port_id, queue_idx); + return -EINVAL; + } + + vq->vq_free_thresh = tx_free_thresh; + + if (hw->use_simple_rxtx) { + uint16_t mid_idx = vq->vq_nentries >> 1; + + for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) { + vq->vq_ring.avail->ring[desc_idx] = + desc_idx + mid_idx; + vq->vq_ring.desc[desc_idx + mid_idx].next = + desc_idx; + vq->vq_ring.desc[desc_idx + mid_idx].addr = + txvq->virtio_net_hdr_mem + + offsetof(struct virtio_tx_region, tx_hdr); + vq->vq_ring.desc[desc_idx + mid_idx].len = + vq->hw->vtnet_hdr_size; + vq->vq_ring.desc[desc_idx + mid_idx].flags = + VRING_DESC_F_NEXT; + vq->vq_ring.desc[desc_idx].flags = 0; + } + for (desc_idx = mid_idx; desc_idx < vq->vq_nentries; + desc_idx++) + vq->vq_ring.avail->ring[desc_idx] = desc_idx; + } + + VIRTQUEUE_DUMP(vq); + + dev->data->tx_queues[queue_idx] = txvq; + return 0; +} + +static void +virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m) +{ + int error; + /* + * Requeue the discarded mbuf. This should always be + * successful since it was just dequeued. + */ + error = virtqueue_enqueue_recv_refill(vq, m); + if (unlikely(error)) { + RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf"); + rte_pktmbuf_free(m); + } +} + +static void +virtio_update_packet_stats(struct virtnet_stats *stats, struct rte_mbuf *mbuf) +{ + uint32_t s = mbuf->pkt_len; + struct ether_addr *ea; + + if (s == 64) { + stats->size_bins[1]++; + } else if (s > 64 && s < 1024) { + uint32_t bin; + + /* count zeros, and offset into correct bin */ + bin = (sizeof(s) * 8) - __builtin_clz(s) - 5; + stats->size_bins[bin]++; + } else { + if (s < 64) + stats->size_bins[0]++; + else if (s < 1519) + stats->size_bins[6]++; + else if (s >= 1519) + stats->size_bins[7]++; + } + + ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *); + if (is_multicast_ether_addr(ea)) { + if (is_broadcast_ether_addr(ea)) + stats->broadcast++; + else + stats->multicast++; + } +} + +/* Optionally fill offload information in structure */ +static int +virtio_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr) +{ + struct rte_net_hdr_lens hdr_lens; + uint32_t hdrlen, ptype; + int l4_supported = 0; + + /* nothing to do */ + if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE) + return 0; + + m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN; + + ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK); + m->packet_type = ptype; + if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP || + (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP || + (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) + l4_supported = 1; + + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len; + if (hdr->csum_start <= hdrlen && l4_supported) { + m->ol_flags |= PKT_RX_L4_CKSUM_NONE; + } else { + /* Unknown proto or tunnel, do sw cksum. We can assume + * the cksum field is in the first segment since the + * buffers we provided to the host are large enough. + * In case of SCTP, this will be wrong since it's a CRC + * but there's nothing we can do. + */ + uint16_t csum, off; + + rte_raw_cksum_mbuf(m, hdr->csum_start, + rte_pktmbuf_pkt_len(m) - hdr->csum_start, + &csum); + if (likely(csum != 0xffff)) + csum = ~csum; + off = hdr->csum_offset + hdr->csum_start; + if (rte_pktmbuf_data_len(m) >= off + 1) + *rte_pktmbuf_mtod_offset(m, uint16_t *, + off) = csum; + } + } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) { + m->ol_flags |= PKT_RX_L4_CKSUM_GOOD; + } + + /* GSO request, save required information in mbuf */ + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + /* Check unsupported modes */ + if ((hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) || + (hdr->gso_size == 0)) { + return -EINVAL; + } + + /* Update mss lengthes in mbuf */ + m->tso_segsz = hdr->gso_size; + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: + m->ol_flags |= PKT_RX_LRO | \ + PKT_RX_L4_CKSUM_NONE; + break; + default: + return -EINVAL; + } + } + + return 0; +} + +static inline int +rx_offload_enabled(struct virtio_hw *hw) +{ + return vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM) || + vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) || + vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO6); +} + +#define VIRTIO_MBUF_BURST_SZ 64 +#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) +uint16_t +virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + struct virtnet_rx *rxvq = rx_queue; + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + struct rte_mbuf *rxm, *new_mbuf; + uint16_t nb_used, num, nb_rx; + uint32_t len[VIRTIO_MBUF_BURST_SZ]; + struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ]; + int error; + uint32_t i, nb_enqueued; + uint32_t hdr_size; + int offload; + struct virtio_net_hdr *hdr; + + nb_rx = 0; + if (unlikely(hw->started == 0)) + return nb_rx; + + nb_used = VIRTQUEUE_NUSED(vq); + + virtio_rmb(); + + num = (uint16_t)(likely(nb_used <= nb_pkts) ? nb_used : nb_pkts); + num = (uint16_t)(likely(num <= VIRTIO_MBUF_BURST_SZ) ? num : VIRTIO_MBUF_BURST_SZ); + if (likely(num > DESC_PER_CACHELINE)) + num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE); + + num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, num); + PMD_RX_LOG(DEBUG, "used:%d dequeue:%d", nb_used, num); + + nb_enqueued = 0; + hdr_size = hw->vtnet_hdr_size; + offload = rx_offload_enabled(hw); + + for (i = 0; i < num ; i++) { + rxm = rcv_pkts[i]; + + PMD_RX_LOG(DEBUG, "packet len:%d", len[i]); + + if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) { + PMD_RX_LOG(ERR, "Packet drop"); + nb_enqueued++; + virtio_discard_rxbuf(vq, rxm); + rxvq->stats.errors++; + continue; + } + + rxm->port = rxvq->port_id; + rxm->data_off = RTE_PKTMBUF_HEADROOM; + rxm->ol_flags = 0; + rxm->vlan_tci = 0; + + rxm->pkt_len = (uint32_t)(len[i] - hdr_size); + rxm->data_len = (uint16_t)(len[i] - hdr_size); + + hdr = (struct virtio_net_hdr *)((char *)rxm->buf_addr + + RTE_PKTMBUF_HEADROOM - hdr_size); + + if (hw->vlan_strip) + rte_vlan_strip(rxm); + + if (offload && virtio_rx_offload(rxm, hdr) < 0) { + virtio_discard_rxbuf(vq, rxm); + rxvq->stats.errors++; + continue; + } + + VIRTIO_DUMP_PACKET(rxm, rxm->data_len); + + rx_pkts[nb_rx++] = rxm; + + rxvq->stats.bytes += rxm->pkt_len; + virtio_update_packet_stats(&rxvq->stats, rxm); + } + + rxvq->stats.packets += nb_rx; + + /* Allocate new mbuf for the used descriptor */ + error = ENOSPC; + while (likely(!virtqueue_full(vq))) { + new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool); + if (unlikely(new_mbuf == NULL)) { + struct rte_eth_dev *dev + = &rte_eth_devices[rxvq->port_id]; + dev->data->rx_mbuf_alloc_failed++; + break; + } + error = virtqueue_enqueue_recv_refill(vq, new_mbuf); + if (unlikely(error)) { + rte_pktmbuf_free(new_mbuf); + break; + } + nb_enqueued++; + } + + if (likely(nb_enqueued)) { + vq_update_avail_idx(vq); + + if (unlikely(virtqueue_kick_prepare(vq))) { + virtqueue_notify(vq); + PMD_RX_LOG(DEBUG, "Notified"); + } + } + + return nb_rx; +} + +uint16_t +virtio_recv_mergeable_pkts(void *rx_queue, + struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + struct virtnet_rx *rxvq = rx_queue; + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + struct rte_mbuf *rxm, *new_mbuf; + uint16_t nb_used, num, nb_rx; + uint32_t len[VIRTIO_MBUF_BURST_SZ]; + struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ]; + struct rte_mbuf *prev; + int error; + uint32_t i, nb_enqueued; + uint32_t seg_num; + uint16_t extra_idx; + uint32_t seg_res; + uint32_t hdr_size; + int offload; + + nb_rx = 0; + if (unlikely(hw->started == 0)) + return nb_rx; + + nb_used = VIRTQUEUE_NUSED(vq); + + virtio_rmb(); + + PMD_RX_LOG(DEBUG, "used:%d", nb_used); + + i = 0; + nb_enqueued = 0; + seg_num = 0; + extra_idx = 0; + seg_res = 0; + hdr_size = hw->vtnet_hdr_size; + offload = rx_offload_enabled(hw); + + while (i < nb_used) { + struct virtio_net_hdr_mrg_rxbuf *header; + + if (nb_rx == nb_pkts) + break; + + num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, 1); + if (num != 1) + continue; + + i++; + + PMD_RX_LOG(DEBUG, "dequeue:%d", num); + PMD_RX_LOG(DEBUG, "packet len:%d", len[0]); + + rxm = rcv_pkts[0]; + + if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) { + PMD_RX_LOG(ERR, "Packet drop"); + nb_enqueued++; + virtio_discard_rxbuf(vq, rxm); + rxvq->stats.errors++; + continue; + } + + header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr + + RTE_PKTMBUF_HEADROOM - hdr_size); + seg_num = header->num_buffers; + + if (seg_num == 0) + seg_num = 1; + + rxm->data_off = RTE_PKTMBUF_HEADROOM; + rxm->nb_segs = seg_num; + rxm->ol_flags = 0; + rxm->vlan_tci = 0; + rxm->pkt_len = (uint32_t)(len[0] - hdr_size); + rxm->data_len = (uint16_t)(len[0] - hdr_size); + + rxm->port = rxvq->port_id; + rx_pkts[nb_rx] = rxm; + prev = rxm; + + if (offload && virtio_rx_offload(rxm, &header->hdr) < 0) { + virtio_discard_rxbuf(vq, rxm); + rxvq->stats.errors++; + continue; + } + + seg_res = seg_num - 1; + + while (seg_res != 0) { + /* + * Get extra segments for current uncompleted packet. + */ + uint16_t rcv_cnt = + RTE_MIN(seg_res, RTE_DIM(rcv_pkts)); + if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) { + uint32_t rx_num = + virtqueue_dequeue_burst_rx(vq, + rcv_pkts, len, rcv_cnt); + i += rx_num; + rcv_cnt = rx_num; + } else { + PMD_RX_LOG(ERR, + "No enough segments for packet."); + nb_enqueued++; + virtio_discard_rxbuf(vq, rxm); + rxvq->stats.errors++; + break; + } + + extra_idx = 0; + + while (extra_idx < rcv_cnt) { + rxm = rcv_pkts[extra_idx]; + + rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size; + rxm->pkt_len = (uint32_t)(len[extra_idx]); + rxm->data_len = (uint16_t)(len[extra_idx]); + + if (prev) + prev->next = rxm; + + prev = rxm; + rx_pkts[nb_rx]->pkt_len += rxm->pkt_len; + extra_idx++; + }; + seg_res -= rcv_cnt; + } + + if (hw->vlan_strip) + rte_vlan_strip(rx_pkts[nb_rx]); + + VIRTIO_DUMP_PACKET(rx_pkts[nb_rx], + rx_pkts[nb_rx]->data_len); + + rxvq->stats.bytes += rx_pkts[nb_rx]->pkt_len; + virtio_update_packet_stats(&rxvq->stats, rx_pkts[nb_rx]); + nb_rx++; + } + + rxvq->stats.packets += nb_rx; + + /* Allocate new mbuf for the used descriptor */ + error = ENOSPC; + while (likely(!virtqueue_full(vq))) { + new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool); + if (unlikely(new_mbuf == NULL)) { + struct rte_eth_dev *dev + = &rte_eth_devices[rxvq->port_id]; + dev->data->rx_mbuf_alloc_failed++; + break; + } + error = virtqueue_enqueue_recv_refill(vq, new_mbuf); + if (unlikely(error)) { + rte_pktmbuf_free(new_mbuf); + break; + } + nb_enqueued++; + } + + if (likely(nb_enqueued)) { + vq_update_avail_idx(vq); + + if (unlikely(virtqueue_kick_prepare(vq))) { + virtqueue_notify(vq); + PMD_RX_LOG(DEBUG, "Notified"); + } + } + + return nb_rx; +} + +uint16_t +virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + struct virtnet_tx *txvq = tx_queue; + struct virtqueue *vq = txvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t hdr_size = hw->vtnet_hdr_size; + uint16_t nb_used, nb_tx = 0; + int error; + + if (unlikely(hw->started == 0)) + return nb_tx; + + if (unlikely(nb_pkts < 1)) + return nb_pkts; + + PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); + nb_used = VIRTQUEUE_NUSED(vq); + + virtio_rmb(); + if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh)) + virtio_xmit_cleanup(vq, nb_used); + + for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) { + struct rte_mbuf *txm = tx_pkts[nb_tx]; + int can_push = 0, use_indirect = 0, slots, need; + + /* Do VLAN tag insertion */ + if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) { + error = rte_vlan_insert(&txm); + if (unlikely(error)) { + rte_pktmbuf_free(txm); + continue; + } + } + + /* optimize ring usage */ + if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) || + vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) && + rte_mbuf_refcnt_read(txm) == 1 && + RTE_MBUF_DIRECT(txm) && + txm->nb_segs == 1 && + rte_pktmbuf_headroom(txm) >= hdr_size && + rte_is_aligned(rte_pktmbuf_mtod(txm, char *), + __alignof__(struct virtio_net_hdr_mrg_rxbuf))) + can_push = 1; + else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) && + txm->nb_segs < VIRTIO_MAX_TX_INDIRECT) + use_indirect = 1; + + /* How many main ring entries are needed to this Tx? + * any_layout => number of segments + * indirect => 1 + * default => number of segments + 1 + */ + slots = use_indirect ? 1 : (txm->nb_segs + !can_push); + need = slots - vq->vq_free_cnt; + + /* Positive value indicates it need free vring descriptors */ + if (unlikely(need > 0)) { + nb_used = VIRTQUEUE_NUSED(vq); + virtio_rmb(); + need = RTE_MIN(need, (int)nb_used); + + virtio_xmit_cleanup(vq, need); + need = slots - vq->vq_free_cnt; + if (unlikely(need > 0)) { + PMD_TX_LOG(ERR, + "No free tx descriptors to transmit"); + break; + } + } + + /* Enqueue Packet buffers */ + virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push); + + txvq->stats.bytes += txm->pkt_len; + virtio_update_packet_stats(&txvq->stats, txm); + } + + txvq->stats.packets += nb_tx; + + if (likely(nb_tx)) { + vq_update_avail_idx(vq); + + if (unlikely(virtqueue_kick_prepare(vq))) { + virtqueue_notify(vq); + PMD_TX_LOG(DEBUG, "Notified backend after xmit"); + } + } + + return nb_tx; +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx.h b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx.h new file mode 100644 index 00000000..28f82d6a --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx.h @@ -0,0 +1,94 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTIO_RXTX_H_ +#define _VIRTIO_RXTX_H_ + +#define RTE_PMD_VIRTIO_RX_MAX_BURST 64 + +struct virtnet_stats { + uint64_t packets; + uint64_t bytes; + uint64_t errors; + uint64_t multicast; + uint64_t broadcast; + /* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */ + uint64_t size_bins[8]; +}; + +struct virtnet_rx { + struct virtqueue *vq; + /* dummy mbuf, for wraparound when processing RX ring. */ + struct rte_mbuf fake_mbuf; + uint64_t mbuf_initializer; /**< value to init mbufs. */ + struct rte_mempool *mpool; /**< mempool for mbuf allocation */ + + uint16_t queue_id; /**< DPDK queue index. */ + uint8_t port_id; /**< Device port identifier. */ + + /* Statistics */ + struct virtnet_stats stats; + + const struct rte_memzone *mz; /**< mem zone to populate RX ring. */ +}; + +struct virtnet_tx { + struct virtqueue *vq; + /**< memzone to populate hdr. */ + const struct rte_memzone *virtio_net_hdr_mz; + phys_addr_t virtio_net_hdr_mem; /**< hdr for each xmit packet */ + + uint16_t queue_id; /**< DPDK queue index. */ + uint8_t port_id; /**< Device port identifier. */ + + /* Statistics */ + struct virtnet_stats stats; + + const struct rte_memzone *mz; /**< mem zone to populate TX ring. */ +}; + +struct virtnet_ctl { + struct virtqueue *vq; + /**< memzone to populate hdr. */ + const struct rte_memzone *virtio_net_hdr_mz; + phys_addr_t virtio_net_hdr_mem; /**< hdr for each xmit packet */ + uint8_t port_id; /**< Device port identifier. */ + const struct rte_memzone *mz; /**< mem zone to populate RX ring. */ +}; + +int virtio_rxq_vec_setup(struct virtnet_rx *rxvq); + +int virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq, + struct rte_mbuf *m); + +#endif /* _VIRTIO_RXTX_H_ */ diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple.c b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple.c new file mode 100644 index 00000000..542cf805 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple.c @@ -0,0 +1,180 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <rte_cycles.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_branch_prediction.h> +#include <rte_mempool.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_prefetch.h> +#include <rte_string_fns.h> +#include <rte_errno.h> +#include <rte_byteorder.h> + +#include "virtio_rxtx_simple.h" + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + +int __attribute__((cold)) +virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq, + struct rte_mbuf *cookie) +{ + struct vq_desc_extra *dxp; + struct vring_desc *start_dp; + uint16_t desc_idx; + + desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1); + dxp = &vq->vq_descx[desc_idx]; + dxp->cookie = (void *)cookie; + vq->sw_ring[desc_idx] = cookie; + + start_dp = vq->vq_ring.desc; + start_dp[desc_idx].addr = + VIRTIO_MBUF_ADDR(cookie, vq) + + RTE_PKTMBUF_HEADROOM - vq->hw->vtnet_hdr_size; + start_dp[desc_idx].len = cookie->buf_len - + RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size; + + vq->vq_free_cnt--; + vq->vq_avail_idx++; + + return 0; +} + +uint16_t +virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + struct virtnet_tx *txvq = tx_queue; + struct virtqueue *vq = txvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t nb_used; + uint16_t desc_idx; + struct vring_desc *start_dp; + uint16_t nb_tail, nb_commit; + int i; + uint16_t desc_idx_max = (vq->vq_nentries >> 1) - 1; + uint16_t nb_tx = 0; + + if (unlikely(hw->started == 0)) + return nb_tx; + + nb_used = VIRTQUEUE_NUSED(vq); + rte_compiler_barrier(); + + if (nb_used >= VIRTIO_TX_FREE_THRESH) + virtio_xmit_cleanup(vq); + + nb_commit = nb_pkts = RTE_MIN((vq->vq_free_cnt >> 1), nb_pkts); + desc_idx = (uint16_t)(vq->vq_avail_idx & desc_idx_max); + start_dp = vq->vq_ring.desc; + nb_tail = (uint16_t) (desc_idx_max + 1 - desc_idx); + + if (nb_commit >= nb_tail) { + for (i = 0; i < nb_tail; i++) + vq->vq_descx[desc_idx + i].cookie = tx_pkts[i]; + for (i = 0; i < nb_tail; i++) { + start_dp[desc_idx].addr = + VIRTIO_MBUF_DATA_DMA_ADDR(*tx_pkts, vq); + start_dp[desc_idx].len = (*tx_pkts)->pkt_len; + tx_pkts++; + desc_idx++; + } + nb_commit -= nb_tail; + desc_idx = 0; + } + for (i = 0; i < nb_commit; i++) + vq->vq_descx[desc_idx + i].cookie = tx_pkts[i]; + for (i = 0; i < nb_commit; i++) { + start_dp[desc_idx].addr = + VIRTIO_MBUF_DATA_DMA_ADDR(*tx_pkts, vq); + start_dp[desc_idx].len = (*tx_pkts)->pkt_len; + tx_pkts++; + desc_idx++; + } + + rte_compiler_barrier(); + + vq->vq_free_cnt -= (uint16_t)(nb_pkts << 1); + vq->vq_avail_idx += nb_pkts; + vq->vq_ring.avail->idx = vq->vq_avail_idx; + txvq->stats.packets += nb_pkts; + + if (likely(nb_pkts)) { + if (unlikely(virtqueue_kick_prepare(vq))) + virtqueue_notify(vq); + } + + return nb_pkts; +} + +int __attribute__((cold)) +virtio_rxq_vec_setup(struct virtnet_rx *rxq) +{ + uintptr_t p; + struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */ + + mb_def.nb_segs = 1; + mb_def.data_off = RTE_PKTMBUF_HEADROOM; + mb_def.port = rxq->port_id; + rte_mbuf_refcnt_set(&mb_def, 1); + + /* prevent compiler reordering: rearm_data covers previous fields */ + rte_compiler_barrier(); + p = (uintptr_t)&mb_def.rearm_data; + rxq->mbuf_initializer = *(uint64_t *)p; + + return 0; +} + +/* Stub for linkage when arch specific implementation is not available */ +uint16_t __attribute__((weak)) +virtio_recv_pkts_vec(void *rx_queue __rte_unused, + struct rte_mbuf **rx_pkts __rte_unused, + uint16_t nb_pkts __rte_unused) +{ + rte_panic("Wrong weak function linked by linker\n"); + return 0; +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple.h b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple.h new file mode 100644 index 00000000..f531c542 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple.h @@ -0,0 +1,136 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTIO_RXTX_SIMPLE_H_ +#define _VIRTIO_RXTX_SIMPLE_H_ + +#include <stdint.h> + +#include "virtio_logs.h" +#include "virtio_ethdev.h" +#include "virtqueue.h" +#include "virtio_rxtx.h" + +#define RTE_VIRTIO_VPMD_RX_BURST 32 +#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST + +static inline void +virtio_rxq_rearm_vec(struct virtnet_rx *rxvq) +{ + int i; + uint16_t desc_idx; + struct rte_mbuf **sw_ring; + struct vring_desc *start_dp; + int ret; + struct virtqueue *vq = rxvq->vq; + + desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1); + sw_ring = &vq->sw_ring[desc_idx]; + start_dp = &vq->vq_ring.desc[desc_idx]; + + ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring, + RTE_VIRTIO_VPMD_RX_REARM_THRESH); + if (unlikely(ret)) { + rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed += + RTE_VIRTIO_VPMD_RX_REARM_THRESH; + return; + } + + for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) { + uintptr_t p; + + p = (uintptr_t)&sw_ring[i]->rearm_data; + *(uint64_t *)p = rxvq->mbuf_initializer; + + start_dp[i].addr = + VIRTIO_MBUF_ADDR(sw_ring[i], vq) + + RTE_PKTMBUF_HEADROOM - vq->hw->vtnet_hdr_size; + start_dp[i].len = sw_ring[i]->buf_len - + RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size; + } + + vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH; + vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH; + vq_update_avail_idx(vq); +} + +#define VIRTIO_TX_FREE_THRESH 32 +#define VIRTIO_TX_MAX_FREE_BUF_SZ 32 +#define VIRTIO_TX_FREE_NR 32 +/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */ +static inline void +virtio_xmit_cleanup(struct virtqueue *vq) +{ + uint16_t i, desc_idx; + uint32_t nb_free = 0; + struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ]; + + desc_idx = (uint16_t)(vq->vq_used_cons_idx & + ((vq->vq_nentries >> 1) - 1)); + m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie; + m = rte_pktmbuf_prefree_seg(m); + if (likely(m != NULL)) { + free[0] = m; + nb_free = 1; + for (i = 1; i < VIRTIO_TX_FREE_NR; i++) { + m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie; + m = rte_pktmbuf_prefree_seg(m); + if (likely(m != NULL)) { + if (likely(m->pool == free[0]->pool)) + free[nb_free++] = m; + else { + rte_mempool_put_bulk(free[0]->pool, + (void **)free, + RTE_MIN(RTE_DIM(free), + nb_free)); + free[0] = m; + nb_free = 1; + } + } + } + rte_mempool_put_bulk(free[0]->pool, (void **)free, + RTE_MIN(RTE_DIM(free), nb_free)); + } else { + for (i = 1; i < VIRTIO_TX_FREE_NR; i++) { + m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie; + m = rte_pktmbuf_prefree_seg(m); + if (m != NULL) + rte_mempool_put(m->pool, m); + } + } + + vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR; + vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1); +} + +#endif /* _VIRTIO_RXTX_SIMPLE_H_ */ diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple_neon.c b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple_neon.c new file mode 100644 index 00000000..ecc62ada --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple_neon.c @@ -0,0 +1,239 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016 + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <rte_byteorder.h> +#include <rte_branch_prediction.h> +#include <rte_cycles.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_errno.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_mempool.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_prefetch.h> +#include <rte_string_fns.h> +#include <rte_vect.h> + +#include "virtio_rxtx_simple.h" + +#define RTE_VIRTIO_VPMD_RX_BURST 32 +#define RTE_VIRTIO_DESC_PER_LOOP 8 +#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST + +/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP) + * + * This routine is for non-mergeable RX, one desc for each guest buffer. + * This routine is based on the RX ring layout optimization. Each entry in the + * avail ring points to the desc with the same index in the desc ring and this + * will never be changed in the driver. + * + * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet + */ +uint16_t +virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + struct virtnet_rx *rxvq = rx_queue; + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t nb_used; + uint16_t desc_idx; + struct vring_used_elem *rused; + struct rte_mbuf **sw_ring; + struct rte_mbuf **sw_ring_end; + uint16_t nb_pkts_received = 0; + + uint8x16_t shuf_msk1 = { + 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */ + 4, 5, 0xFF, 0xFF, /* pkt len */ + 4, 5, /* dat len */ + 0xFF, 0xFF, /* vlan tci */ + 0xFF, 0xFF, 0xFF, 0xFF + }; + + uint8x16_t shuf_msk2 = { + 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */ + 12, 13, 0xFF, 0xFF, /* pkt len */ + 12, 13, /* dat len */ + 0xFF, 0xFF, /* vlan tci */ + 0xFF, 0xFF, 0xFF, 0xFF + }; + + /* Subtract the header length. + * In which case do we need the header length in used->len ? + */ + uint16x8_t len_adjust = { + 0, 0, + (uint16_t)vq->hw->vtnet_hdr_size, 0, + (uint16_t)vq->hw->vtnet_hdr_size, + 0, + 0, 0 + }; + + if (unlikely(hw->started == 0)) + return nb_pkts_received; + + if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) + return 0; + + nb_used = VIRTQUEUE_NUSED(vq); + + rte_rmb(); + + if (unlikely(nb_used == 0)) + return 0; + + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP); + nb_used = RTE_MIN(nb_used, nb_pkts); + + desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + rused = &vq->vq_ring.used->ring[desc_idx]; + sw_ring = &vq->sw_ring[desc_idx]; + sw_ring_end = &vq->sw_ring[vq->vq_nentries]; + + rte_prefetch_non_temporal(rused); + + if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) { + virtio_rxq_rearm_vec(rxvq); + if (unlikely(virtqueue_kick_prepare(vq))) + virtqueue_notify(vq); + } + + for (nb_pkts_received = 0; + nb_pkts_received < nb_used;) { + uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2]; + uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2]; + uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP]; + + mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0)); + desc[0] = vld1q_u64((uint64_t *)(rused + 0)); + vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]); + + mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2)); + desc[1] = vld1q_u64((uint64_t *)(rused + 2)); + vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]); + + mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4)); + desc[2] = vld1q_u64((uint64_t *)(rused + 4)); + vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]); + + mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6)); + desc[3] = vld1q_u64((uint64_t *)(rused + 6)); + vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]); + + pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[0]), shuf_msk2)); + pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[0]), shuf_msk1)); + pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[1]), len_adjust)); + pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[0]), len_adjust)); + vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, + pkt_mb[1]); + vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, + pkt_mb[0]); + + pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[1]), shuf_msk2)); + pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[1]), shuf_msk1)); + pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[3]), len_adjust)); + pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[2]), len_adjust)); + vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, + pkt_mb[3]); + vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, + pkt_mb[2]); + + pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[2]), shuf_msk2)); + pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[2]), shuf_msk1)); + pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[5]), len_adjust)); + pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[4]), len_adjust)); + vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1, + pkt_mb[5]); + vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1, + pkt_mb[4]); + + pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[3]), shuf_msk2)); + pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[3]), shuf_msk1)); + pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[7]), len_adjust)); + pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[6]), len_adjust)); + vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1, + pkt_mb[7]); + vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1, + pkt_mb[6]); + + if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) { + if (sw_ring + nb_used <= sw_ring_end) + nb_pkts_received += nb_used; + else + nb_pkts_received += sw_ring_end - sw_ring; + break; + } else { + if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >= + sw_ring_end)) { + nb_pkts_received += sw_ring_end - sw_ring; + break; + } else { + nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP; + + rx_pkts += RTE_VIRTIO_DESC_PER_LOOP; + sw_ring += RTE_VIRTIO_DESC_PER_LOOP; + rused += RTE_VIRTIO_DESC_PER_LOOP; + nb_used -= RTE_VIRTIO_DESC_PER_LOOP; + } + } + } + + vq->vq_used_cons_idx += nb_pkts_received; + vq->vq_free_cnt += nb_pkts_received; + rxvq->stats.packets += nb_pkts_received; + return nb_pkts_received; +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple_sse.c b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple_sse.c new file mode 100644 index 00000000..7cf0f8b8 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_rxtx_simple_sse.c @@ -0,0 +1,226 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <tmmintrin.h> + +#include <rte_byteorder.h> +#include <rte_branch_prediction.h> +#include <rte_cycles.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_errno.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_mempool.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_prefetch.h> +#include <rte_string_fns.h> + +#include "virtio_rxtx_simple.h" + +#define RTE_VIRTIO_VPMD_RX_BURST 32 +#define RTE_VIRTIO_DESC_PER_LOOP 8 +#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST + +/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP) + * + * This routine is for non-mergeable RX, one desc for each guest buffer. + * This routine is based on the RX ring layout optimization. Each entry in the + * avail ring points to the desc with the same index in the desc ring and this + * will never be changed in the driver. + * + * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet + */ +uint16_t +virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + struct virtnet_rx *rxvq = rx_queue; + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t nb_used; + uint16_t desc_idx; + struct vring_used_elem *rused; + struct rte_mbuf **sw_ring; + struct rte_mbuf **sw_ring_end; + uint16_t nb_pkts_received = 0; + __m128i shuf_msk1, shuf_msk2, len_adjust; + + shuf_msk1 = _mm_set_epi8( + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, /* vlan tci */ + 5, 4, /* dat len */ + 0xFF, 0xFF, 5, 4, /* pkt len */ + 0xFF, 0xFF, 0xFF, 0xFF /* packet type */ + + ); + + shuf_msk2 = _mm_set_epi8( + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, /* vlan tci */ + 13, 12, /* dat len */ + 0xFF, 0xFF, 13, 12, /* pkt len */ + 0xFF, 0xFF, 0xFF, 0xFF /* packet type */ + ); + + /* Subtract the header length. + * In which case do we need the header length in used->len ? + */ + len_adjust = _mm_set_epi16( + 0, 0, + 0, + (uint16_t)-vq->hw->vtnet_hdr_size, + 0, (uint16_t)-vq->hw->vtnet_hdr_size, + 0, 0); + + if (unlikely(hw->started == 0)) + return nb_pkts_received; + + if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP)) + return 0; + + nb_used = VIRTQUEUE_NUSED(vq); + + rte_compiler_barrier(); + + if (unlikely(nb_used == 0)) + return 0; + + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP); + nb_used = RTE_MIN(nb_used, nb_pkts); + + desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + rused = &vq->vq_ring.used->ring[desc_idx]; + sw_ring = &vq->sw_ring[desc_idx]; + sw_ring_end = &vq->sw_ring[vq->vq_nentries]; + + rte_prefetch0(rused); + + if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) { + virtio_rxq_rearm_vec(rxvq); + if (unlikely(virtqueue_kick_prepare(vq))) + virtqueue_notify(vq); + } + + for (nb_pkts_received = 0; + nb_pkts_received < nb_used;) { + __m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2]; + __m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2]; + __m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP]; + + mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0)); + desc[0] = _mm_loadu_si128((__m128i *)(rused + 0)); + _mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]); + + mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2)); + desc[1] = _mm_loadu_si128((__m128i *)(rused + 2)); + _mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]); + + mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4)); + desc[2] = _mm_loadu_si128((__m128i *)(rused + 4)); + _mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]); + + mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6)); + desc[3] = _mm_loadu_si128((__m128i *)(rused + 6)); + _mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]); + + pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2); + pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1); + pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust); + pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust); + _mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1, + pkt_mb[1]); + _mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1, + pkt_mb[0]); + + pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2); + pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1); + pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust); + pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust); + _mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1, + pkt_mb[3]); + _mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1, + pkt_mb[2]); + + pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2); + pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1); + pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust); + pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust); + _mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1, + pkt_mb[5]); + _mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1, + pkt_mb[4]); + + pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2); + pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1); + pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust); + pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust); + _mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1, + pkt_mb[7]); + _mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1, + pkt_mb[6]); + + if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) { + if (sw_ring + nb_used <= sw_ring_end) + nb_pkts_received += nb_used; + else + nb_pkts_received += sw_ring_end - sw_ring; + break; + } else { + if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >= + sw_ring_end)) { + nb_pkts_received += sw_ring_end - sw_ring; + break; + } else { + nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP; + + rx_pkts += RTE_VIRTIO_DESC_PER_LOOP; + sw_ring += RTE_VIRTIO_DESC_PER_LOOP; + rused += RTE_VIRTIO_DESC_PER_LOOP; + nb_used -= RTE_VIRTIO_DESC_PER_LOOP; + } + } + } + + vq->vq_used_cons_idx += nb_pkts_received; + vq->vq_free_cnt += nb_pkts_received; + rxvq->stats.packets += nb_pkts_received; + return nb_pkts_received; +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost.h b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost.h new file mode 100644 index 00000000..5c983bd4 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost.h @@ -0,0 +1,123 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include <stdint.h> +#include <linux/types.h> +#include <linux/ioctl.h> + +#include "../virtio_pci.h" +#include "../virtio_logs.h" +#include "../virtqueue.h" + +struct vhost_vring_state { + unsigned int index; + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; + int fd; +}; + +struct vhost_vring_addr { + unsigned int index; + /* Option flags. */ + unsigned int flags; + /* Flag values: */ + /* Whether log address is valid. If set enables logging. */ +#define VHOST_VRING_F_LOG 0 + + /* Start of array of descriptors (virtually contiguous) */ + uint64_t desc_user_addr; + /* Used structure address. Must be 32 bit aligned */ + uint64_t used_user_addr; + /* Available structure address. Must be 16 bit aligned */ + uint64_t avail_user_addr; + /* Logging support. */ + /* Log writes to used structure, at offset calculated from specified + * address. Address must be 32 bit aligned. + */ + uint64_t log_guest_addr; +}; + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_MAX +}; + +const char * const vhost_msg_strings[VHOST_USER_MAX]; + +struct vhost_memory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; /* bytes */ + uint64_t userspace_addr; + uint64_t mmap_offset; +}; + +struct virtio_user_dev; + +struct virtio_user_backend_ops { + int (*setup)(struct virtio_user_dev *dev); + int (*send_request)(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg); + int (*enable_qp)(struct virtio_user_dev *dev, + uint16_t pair_idx, + int enable); +}; + +struct virtio_user_backend_ops ops_user; +struct virtio_user_backend_ops ops_kernel; + +#endif diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel.c b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel.c new file mode 100644 index 00000000..68d28b13 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel.c @@ -0,0 +1,403 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <rte_memory.h> +#include <rte_eal_memconfig.h> + +#include "vhost.h" +#include "virtio_user_dev.h" +#include "vhost_kernel_tap.h" + +struct vhost_memory_kernel { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[0]; +}; + +/* vhost kernel ioctls */ +#define VHOST_VIRTIO 0xAF +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel) +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +static uint64_t max_regions = 64; + +static void +get_vhost_kernel_max_regions(void) +{ + int fd; + char buf[20] = {'\0'}; + + fd = open("/sys/module/vhost/parameters/max_mem_regions", O_RDONLY); + if (fd < 0) + return; + + if (read(fd, buf, sizeof(buf) - 1) > 0) + max_regions = strtoull(buf, NULL, 10); + + close(fd); +} + +static uint64_t vhost_req_user_to_kernel[] = { + [VHOST_USER_SET_OWNER] = VHOST_SET_OWNER, + [VHOST_USER_RESET_OWNER] = VHOST_RESET_OWNER, + [VHOST_USER_SET_FEATURES] = VHOST_SET_FEATURES, + [VHOST_USER_GET_FEATURES] = VHOST_GET_FEATURES, + [VHOST_USER_SET_VRING_CALL] = VHOST_SET_VRING_CALL, + [VHOST_USER_SET_VRING_NUM] = VHOST_SET_VRING_NUM, + [VHOST_USER_SET_VRING_BASE] = VHOST_SET_VRING_BASE, + [VHOST_USER_GET_VRING_BASE] = VHOST_GET_VRING_BASE, + [VHOST_USER_SET_VRING_ADDR] = VHOST_SET_VRING_ADDR, + [VHOST_USER_SET_VRING_KICK] = VHOST_SET_VRING_KICK, + [VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE, +}; + +/* By default, vhost kernel module allows 64 regions, but DPDK allows + * 256 segments. As a relief, below function merges those virtually + * adjacent memsegs into one region. + */ +static struct vhost_memory_kernel * +prepare_vhost_memory_kernel(void) +{ + uint32_t i, j, k = 0; + struct rte_memseg *seg; + struct vhost_memory_region *mr; + struct vhost_memory_kernel *vm; + + vm = malloc(sizeof(struct vhost_memory_kernel) + + max_regions * + sizeof(struct vhost_memory_region)); + if (!vm) + return NULL; + + for (i = 0; i < RTE_MAX_MEMSEG; ++i) { + seg = &rte_eal_get_configuration()->mem_config->memseg[i]; + if (!seg->addr) + break; + + int new_region = 1; + + for (j = 0; j < k; ++j) { + mr = &vm->regions[j]; + + if (mr->userspace_addr + mr->memory_size == + (uint64_t)(uintptr_t)seg->addr) { + mr->memory_size += seg->len; + new_region = 0; + break; + } + + if ((uint64_t)(uintptr_t)seg->addr + seg->len == + mr->userspace_addr) { + mr->guest_phys_addr = + (uint64_t)(uintptr_t)seg->addr; + mr->userspace_addr = + (uint64_t)(uintptr_t)seg->addr; + mr->memory_size += seg->len; + new_region = 0; + break; + } + } + + if (new_region == 0) + continue; + + mr = &vm->regions[k++]; + /* use vaddr here! */ + mr->guest_phys_addr = (uint64_t)(uintptr_t)seg->addr; + mr->userspace_addr = (uint64_t)(uintptr_t)seg->addr; + mr->memory_size = seg->len; + mr->mmap_offset = 0; + + if (k >= max_regions) { + free(vm); + return NULL; + } + } + + vm->nregions = k; + vm->padding = 0; + return vm; +} + +/* with below features, vhost kernel does not need to do the checksum and TSO, + * these info will be passed to virtio_user through virtio net header. + */ +#define VHOST_KERNEL_GUEST_OFFLOADS_MASK \ + ((1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ + (1ULL << VIRTIO_NET_F_GUEST_UFO)) + +/* with below features, when flows from virtio_user to vhost kernel + * (1) if flows goes up through the kernel networking stack, it does not need + * to verify checksum, which can save CPU cycles; + * (2) if flows goes through a Linux bridge and outside from an interface + * (kernel driver), checksum and TSO will be done by GSO in kernel or even + * offloaded into real physical device. + */ +#define VHOST_KERNEL_HOST_OFFLOADS_MASK \ + ((1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM)) + +static int +tap_supporte_mq(void) +{ + int tapfd; + unsigned int tap_features; + + tapfd = open(PATH_NET_TUN, O_RDWR); + if (tapfd < 0) { + PMD_DRV_LOG(ERR, "fail to open %s: %s", + PATH_NET_TUN, strerror(errno)); + return -1; + } + + if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) { + PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno)); + close(tapfd); + return -1; + } + + close(tapfd); + return tap_features & IFF_MULTI_QUEUE; +} + +static int +vhost_kernel_ioctl(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg) +{ + int ret = -1; + unsigned int i; + uint64_t req_kernel; + struct vhost_memory_kernel *vm = NULL; + int vhostfd; + unsigned int queue_sel; + + PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]); + + req_kernel = vhost_req_user_to_kernel[req]; + + if (req_kernel == VHOST_SET_MEM_TABLE) { + vm = prepare_vhost_memory_kernel(); + if (!vm) + return -1; + arg = (void *)vm; + } + + if (req_kernel == VHOST_SET_FEATURES) { + /* We don't need memory protection here */ + *(uint64_t *)arg &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); + + /* VHOST kernel does not know about below flags */ + *(uint64_t *)arg &= ~VHOST_KERNEL_GUEST_OFFLOADS_MASK; + *(uint64_t *)arg &= ~VHOST_KERNEL_HOST_OFFLOADS_MASK; + + *(uint64_t *)arg &= ~(1ULL << VIRTIO_NET_F_MQ); + } + + switch (req_kernel) { + case VHOST_SET_VRING_NUM: + case VHOST_SET_VRING_ADDR: + case VHOST_SET_VRING_BASE: + case VHOST_GET_VRING_BASE: + case VHOST_SET_VRING_KICK: + case VHOST_SET_VRING_CALL: + queue_sel = *(unsigned int *)arg; + vhostfd = dev->vhostfds[queue_sel / 2]; + *(unsigned int *)arg = queue_sel % 2; + PMD_DRV_LOG(DEBUG, "vhostfd=%d, index=%u", + vhostfd, *(unsigned int *)arg); + break; + default: + vhostfd = -1; + } + if (vhostfd == -1) { + for (i = 0; i < dev->max_queue_pairs; ++i) { + if (dev->vhostfds[i] < 0) + continue; + + ret = ioctl(dev->vhostfds[i], req_kernel, arg); + if (ret < 0) + break; + } + } else { + ret = ioctl(vhostfd, req_kernel, arg); + } + + if (!ret && req_kernel == VHOST_GET_FEATURES) { + /* with tap as the backend, all these features are supported + * but not claimed by vhost-net, so we add them back when + * reporting to upper layer. + */ + *((uint64_t *)arg) |= VHOST_KERNEL_GUEST_OFFLOADS_MASK; + *((uint64_t *)arg) |= VHOST_KERNEL_HOST_OFFLOADS_MASK; + + /* vhost_kernel will not declare this feature, but it does + * support multi-queue. + */ + if (tap_supporte_mq()) + *(uint64_t *)arg |= (1ull << VIRTIO_NET_F_MQ); + } + + if (vm) + free(vm); + + if (ret < 0) + PMD_DRV_LOG(ERR, "%s failed: %s", + vhost_msg_strings[req], strerror(errno)); + + return ret; +} + +/** + * Set up environment to talk with a vhost kernel backend. + * + * @return + * - (-1) if fail to set up; + * - (>=0) if successful. + */ +static int +vhost_kernel_setup(struct virtio_user_dev *dev) +{ + int vhostfd; + uint32_t i; + + get_vhost_kernel_max_regions(); + + for (i = 0; i < dev->max_queue_pairs; ++i) { + vhostfd = open(dev->path, O_RDWR); + if (vhostfd < 0) { + PMD_DRV_LOG(ERR, "fail to open %s, %s", + dev->path, strerror(errno)); + return -1; + } + + dev->vhostfds[i] = vhostfd; + } + + return 0; +} + +static int +vhost_kernel_set_backend(int vhostfd, int tapfd) +{ + struct vhost_vring_file f; + + f.fd = tapfd; + f.index = 0; + if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) { + PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s", + strerror(errno)); + return -1; + } + + f.index = 1; + if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) { + PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static int +vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev, + uint16_t pair_idx, + int enable) +{ + int hdr_size; + int vhostfd; + int tapfd; + int req_mq = (dev->max_queue_pairs > 1); + + vhostfd = dev->vhostfds[pair_idx]; + + if (!enable) { + if (dev->tapfds[pair_idx] >= 0) { + close(dev->tapfds[pair_idx]); + dev->tapfds[pair_idx] = -1; + } + return vhost_kernel_set_backend(vhostfd, -1); + } else if (dev->tapfds[pair_idx] >= 0) { + return 0; + } + + if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) || + (dev->features & (1ULL << VIRTIO_F_VERSION_1))) + hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + hdr_size = sizeof(struct virtio_net_hdr); + + tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq); + if (tapfd < 0) { + PMD_DRV_LOG(ERR, "fail to open tap for vhost kernel"); + return -1; + } + + if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) { + PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel"); + close(tapfd); + return -1; + } + + dev->tapfds[pair_idx] = tapfd; + return 0; +} + +struct virtio_user_backend_ops ops_kernel = { + .setup = vhost_kernel_setup, + .send_request = vhost_kernel_ioctl, + .enable_qp = vhost_kernel_enable_queue_pair +}; diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.c b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.c new file mode 100644 index 00000000..f585de8c --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.c @@ -0,0 +1,133 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <net/if.h> +#include <errno.h> +#include <string.h> +#include <limits.h> + +#include "vhost_kernel_tap.h" +#include "../virtio_logs.h" + +int +vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq) +{ + unsigned int tap_features; + int sndbuf = INT_MAX; + struct ifreq ifr; + int tapfd; + unsigned int offload = + TUN_F_CSUM | + TUN_F_TSO4 | + TUN_F_TSO6 | + TUN_F_TSO_ECN | + TUN_F_UFO; + + /* TODO: + * 1. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len + * 2. get number of memory regions from vhost module parameter + * max_mem_regions, supported in newer version linux kernel + */ + tapfd = open(PATH_NET_TUN, O_RDWR); + if (tapfd < 0) { + PMD_DRV_LOG(ERR, "fail to open %s: %s", + PATH_NET_TUN, strerror(errno)); + return -1; + } + + /* Construct ifr */ + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + + if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) { + PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno)); + goto error; + } + if (tap_features & IFF_ONE_QUEUE) + ifr.ifr_flags |= IFF_ONE_QUEUE; + + /* Let tap instead of vhost-net handle vnet header, as the latter does + * not support offloading. And in this case, we should not set feature + * bit VHOST_NET_F_VIRTIO_NET_HDR. + */ + if (tap_features & IFF_VNET_HDR) { + ifr.ifr_flags |= IFF_VNET_HDR; + } else { + PMD_DRV_LOG(ERR, "TAP does not support IFF_VNET_HDR"); + goto error; + } + + if (req_mq) + ifr.ifr_flags |= IFF_MULTI_QUEUE; + + if (*p_ifname) + strncpy(ifr.ifr_name, *p_ifname, IFNAMSIZ); + else + strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ); + if (ioctl(tapfd, TUNSETIFF, (void *)&ifr) == -1) { + PMD_DRV_LOG(ERR, "TUNSETIFF failed: %s", strerror(errno)); + goto error; + } + + fcntl(tapfd, F_SETFL, O_NONBLOCK); + + if (ioctl(tapfd, TUNSETVNETHDRSZ, &hdr_size) < 0) { + PMD_DRV_LOG(ERR, "TUNSETVNETHDRSZ failed: %s", strerror(errno)); + goto error; + } + + if (ioctl(tapfd, TUNSETSNDBUF, &sndbuf) < 0) { + PMD_DRV_LOG(ERR, "TUNSETSNDBUF failed: %s", strerror(errno)); + goto error; + } + + /* TODO: before set the offload capabilities, we'd better (1) check + * negotiated features to see if necessary to offload; (2) query tap + * to see if it supports the offload capabilities. + */ + if (ioctl(tapfd, TUNSETOFFLOAD, offload) != 0) + PMD_DRV_LOG(ERR, "TUNSETOFFLOAD ioctl() failed: %s", + strerror(errno)); + + if (!(*p_ifname)) + *p_ifname = strdup(ifr.ifr_name); + + return tapfd; +error: + close(tapfd); + return -1; +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.h b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.h new file mode 100644 index 00000000..eae340cc --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/ioctl.h> + +/* TUN ioctls */ +#define TUNSETIFF _IOW('T', 202, int) +#define TUNGETFEATURES _IOR('T', 207, unsigned int) +#define TUNSETOFFLOAD _IOW('T', 208, unsigned int) +#define TUNGETIFF _IOR('T', 210, unsigned int) +#define TUNSETSNDBUF _IOW('T', 212, int) +#define TUNGETVNETHDRSZ _IOR('T', 215, int) +#define TUNSETVNETHDRSZ _IOW('T', 216, int) +#define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETVNETLE _IOW('T', 220, int) +#define TUNSETVNETBE _IOW('T', 222, int) + +/* TUNSETIFF ifr flags */ +#define IFF_TAP 0x0002 +#define IFF_NO_PI 0x1000 +#define IFF_ONE_QUEUE 0x2000 +#define IFF_VNET_HDR 0x4000 +#define IFF_MULTI_QUEUE 0x0100 +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 + +/* Features for GSO (TUNSETOFFLOAD). */ +#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ +#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ +#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ +#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ +#define TUN_F_UFO 0x10 /* I can handle UFO packets */ + +/* Constants */ +#define PATH_NET_TUN "/dev/net/tun" + +int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq); diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_user.c b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_user.c new file mode 100644 index 00000000..4ad7b21b --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user/vhost_user.c @@ -0,0 +1,467 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/un.h> +#include <string.h> +#include <errno.h> + +#include "vhost.h" +#include "virtio_user_dev.h" + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + +#define VHOST_MEMORY_MAX_NREGIONS 8 +struct vhost_memory { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[VHOST_MEMORY_MAX_NREGIONS]; +}; + +struct vhost_user_msg { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_memory memory; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)); + +#define VHOST_USER_HDR_SIZE offsetof(struct vhost_user_msg, payload.u64) +#define VHOST_USER_PAYLOAD_SIZE \ + (sizeof(struct vhost_user_msg) - VHOST_USER_HDR_SIZE) + +static int +vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num) +{ + int r; + struct msghdr msgh; + struct iovec iov; + size_t fd_size = fd_num * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + struct cmsghdr *cmsg; + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = (uint8_t *)buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fd_size); + + do { + r = sendmsg(fd, &msgh, 0); + } while (r < 0 && errno == EINTR); + + return r; +} + +static int +vhost_user_read(int fd, struct vhost_user_msg *msg) +{ + uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; + int ret, sz_hdr = VHOST_USER_HDR_SIZE, sz_payload; + + ret = recv(fd, (void *)msg, sz_hdr, 0); + if (ret < sz_hdr) { + PMD_DRV_LOG(ERR, "Failed to recv msg hdr: %d instead of %d.", + ret, sz_hdr); + goto fail; + } + + /* validate msg flags */ + if (msg->flags != (valid_flags)) { + PMD_DRV_LOG(ERR, "Failed to recv msg: flags %x instead of %x.", + msg->flags, valid_flags); + goto fail; + } + + sz_payload = msg->size; + if (sz_payload) { + ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0); + if (ret < sz_payload) { + PMD_DRV_LOG(ERR, + "Failed to recv msg payload: %d instead of %d.", + ret, msg->size); + goto fail; + } + } + + return 0; + +fail: + return -1; +} + +struct hugepage_file_info { + uint64_t addr; /**< virtual addr */ + size_t size; /**< the file size */ + char path[PATH_MAX]; /**< path to backing file */ +}; + +/* Two possible options: + * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file + * array. This is simple but cannot be used in secondary process because + * secondary process will close and munmap that file. + * 2. Match HUGEFILE_FMT to find hugepage files directly. + * + * We choose option 2. + */ +static int +get_hugepage_file_info(struct hugepage_file_info huges[], int max) +{ + int idx; + FILE *f; + char buf[BUFSIZ], *tmp, *tail; + char *str_underline, *str_start; + int huge_index; + uint64_t v_start, v_end; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + PMD_DRV_LOG(ERR, "cannot open /proc/self/maps"); + return -1; + } + + idx = 0; + while (fgets(buf, sizeof(buf), f) != NULL) { + if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) { + PMD_DRV_LOG(ERR, "Failed to parse address"); + goto error; + } + + tmp = strchr(buf, ' ') + 1; /** skip address */ + tmp = strchr(tmp, ' ') + 1; /** skip perm */ + tmp = strchr(tmp, ' ') + 1; /** skip offset */ + tmp = strchr(tmp, ' ') + 1; /** skip dev */ + tmp = strchr(tmp, ' ') + 1; /** skip inode */ + while (*tmp == ' ') /** skip spaces */ + tmp++; + tail = strrchr(tmp, '\n'); /** remove newline if exists */ + if (tail) + *tail = '\0'; + + /* Match HUGEFILE_FMT, aka "%s/%smap_%d", + * which is defined in eal_filesystem.h + */ + str_underline = strrchr(tmp, '_'); + if (!str_underline) + continue; + + str_start = str_underline - strlen("map"); + if (str_start < tmp) + continue; + + if (sscanf(str_start, "map_%d", &huge_index) != 1) + continue; + + if (idx >= max) { + PMD_DRV_LOG(ERR, "Exceed maximum of %d", max); + goto error; + } + huges[idx].addr = v_start; + huges[idx].size = v_end - v_start; + snprintf(huges[idx].path, PATH_MAX, "%s", tmp); + idx++; + } + + fclose(f); + return idx; + +error: + fclose(f); + return -1; +} + +static int +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) +{ + int i, num; + struct hugepage_file_info huges[VHOST_MEMORY_MAX_NREGIONS]; + struct vhost_memory_region *mr; + + num = get_hugepage_file_info(huges, VHOST_MEMORY_MAX_NREGIONS); + if (num < 0) { + PMD_INIT_LOG(ERR, "Failed to prepare memory for vhost-user"); + return -1; + } + + for (i = 0; i < num; ++i) { + mr = &msg->payload.memory.regions[i]; + mr->guest_phys_addr = huges[i].addr; /* use vaddr! */ + mr->userspace_addr = huges[i].addr; + mr->memory_size = huges[i].size; + mr->mmap_offset = 0; + fds[i] = open(huges[i].path, O_RDWR); + } + + msg->payload.memory.nregions = num; + msg->payload.memory.padding = 0; + + return 0; +} + +static struct vhost_user_msg m; + +const char * const vhost_msg_strings[] = { + [VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER", + [VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES", + [VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES", + [VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL", + [VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM", + [VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE", +}; + +static int +vhost_user_sock(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg) +{ + struct vhost_user_msg msg; + struct vhost_vring_file *file = 0; + int need_reply = 0; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fd_num = 0; + int i, len; + int vhostfd = dev->vhostfd; + + RTE_SET_USED(m); + + PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]); + + msg.request = req; + msg.flags = VHOST_USER_VERSION; + msg.size = 0; + + switch (req) { + case VHOST_USER_GET_FEATURES: + need_reply = 1; + break; + + case VHOST_USER_SET_FEATURES: + case VHOST_USER_SET_LOG_BASE: + msg.payload.u64 = *((__u64 *)arg); + msg.size = sizeof(m.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + break; + + case VHOST_USER_SET_MEM_TABLE: + if (prepare_vhost_memory_user(&msg, fds) < 0) + return -1; + fd_num = msg.payload.memory.nregions; + msg.size = sizeof(m.payload.memory.nregions); + msg.size += sizeof(m.payload.memory.padding); + msg.size += fd_num * sizeof(struct vhost_memory_region); + break; + + case VHOST_USER_SET_LOG_FD: + fds[fd_num++] = *((int *)arg); + break; + + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(m.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(m.payload.state); + need_reply = 1; + break; + + case VHOST_USER_SET_VRING_ADDR: + memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr)); + msg.size = sizeof(m.payload.addr); + break; + + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + file = arg; + msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK; + msg.size = sizeof(m.payload.u64); + if (file->fd > 0) + fds[fd_num++] = file->fd; + else + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; + break; + + default: + PMD_DRV_LOG(ERR, "trying to send unhandled msg type"); + return -1; + } + + len = VHOST_USER_HDR_SIZE + msg.size; + if (vhost_user_write(vhostfd, &msg, len, fds, fd_num) < 0) { + PMD_DRV_LOG(ERR, "%s failed: %s", + vhost_msg_strings[req], strerror(errno)); + return -1; + } + + if (req == VHOST_USER_SET_MEM_TABLE) + for (i = 0; i < fd_num; ++i) + close(fds[i]); + + if (need_reply) { + if (vhost_user_read(vhostfd, &msg) < 0) { + PMD_DRV_LOG(ERR, "Received msg failed: %s", + strerror(errno)); + return -1; + } + + if (req != msg.request) { + PMD_DRV_LOG(ERR, "Received unexpected msg type"); + return -1; + } + + switch (req) { + case VHOST_USER_GET_FEATURES: + if (msg.size != sizeof(m.payload.u64)) { + PMD_DRV_LOG(ERR, "Received bad msg size"); + return -1; + } + *((__u64 *)arg) = msg.payload.u64; + break; + case VHOST_USER_GET_VRING_BASE: + if (msg.size != sizeof(m.payload.state)) { + PMD_DRV_LOG(ERR, "Received bad msg size"); + return -1; + } + memcpy(arg, &msg.payload.state, + sizeof(struct vhost_vring_state)); + break; + default: + PMD_DRV_LOG(ERR, "Received unexpected msg type"); + return -1; + } + } + + return 0; +} + +/** + * Set up environment to talk with a vhost user backend. + * + * @return + * - (-1) if fail; + * - (0) if succeed. + */ +static int +vhost_user_setup(struct virtio_user_dev *dev) +{ + int fd; + int flag; + struct sockaddr_un un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + PMD_DRV_LOG(ERR, "socket() error, %s", strerror(errno)); + return -1; + } + + flag = fcntl(fd, F_GETFD); + if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) + PMD_DRV_LOG(WARNING, "fcntl failed, %s", strerror(errno)); + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path); + if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + PMD_DRV_LOG(ERR, "connect error, %s", strerror(errno)); + close(fd); + return -1; + } + + dev->vhostfd = fd; + return 0; +} + +static int +vhost_user_enable_queue_pair(struct virtio_user_dev *dev, + uint16_t pair_idx, + int enable) +{ + int i; + + for (i = 0; i < 2; ++i) { + struct vhost_vring_state state = { + .index = pair_idx * 2 + i, + .num = enable, + }; + + if (vhost_user_sock(dev, VHOST_USER_SET_VRING_ENABLE, &state)) + return -1; + } + + return 0; +} + +struct virtio_user_backend_ops ops_user = { + .setup = vhost_user_setup, + .send_request = vhost_user_sock, + .enable_qp = vhost_user_enable_queue_pair +}; diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.c b/src/seastar/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.c new file mode 100644 index 00000000..450404ba --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.c @@ -0,0 +1,501 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <stdio.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <sys/mman.h> +#include <unistd.h> +#include <sys/eventfd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "vhost.h" +#include "virtio_user_dev.h" +#include "../virtio_ethdev.h" + +static int +virtio_user_create_queue(struct virtio_user_dev *dev, uint32_t queue_sel) +{ + /* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come + * firstly because vhost depends on this msg to allocate virtqueue + * pair. + */ + struct vhost_vring_file file; + + file.index = queue_sel; + file.fd = dev->callfds[queue_sel]; + dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file); + + return 0; +} + +static int +virtio_user_kick_queue(struct virtio_user_dev *dev, uint32_t queue_sel) +{ + struct vhost_vring_file file; + struct vhost_vring_state state; + struct vring *vring = &dev->vrings[queue_sel]; + struct vhost_vring_addr addr = { + .index = queue_sel, + .desc_user_addr = (uint64_t)(uintptr_t)vring->desc, + .avail_user_addr = (uint64_t)(uintptr_t)vring->avail, + .used_user_addr = (uint64_t)(uintptr_t)vring->used, + .log_guest_addr = 0, + .flags = 0, /* disable log */ + }; + + state.index = queue_sel; + state.num = vring->num; + dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state); + + state.index = queue_sel; + state.num = 0; /* no reservation */ + dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state); + + dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr); + + /* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK comes + * lastly because vhost depends on this msg to judge if + * virtio is ready. + */ + file.index = queue_sel; + file.fd = dev->kickfds[queue_sel]; + dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file); + + return 0; +} + +static int +virtio_user_queue_setup(struct virtio_user_dev *dev, + int (*fn)(struct virtio_user_dev *, uint32_t)) +{ + uint32_t i, queue_sel; + + for (i = 0; i < dev->max_queue_pairs; ++i) { + queue_sel = 2 * i + VTNET_SQ_RQ_QUEUE_IDX; + if (fn(dev, queue_sel) < 0) { + PMD_DRV_LOG(INFO, "setup rx vq fails: %u", i); + return -1; + } + } + for (i = 0; i < dev->max_queue_pairs; ++i) { + queue_sel = 2 * i + VTNET_SQ_TQ_QUEUE_IDX; + if (fn(dev, queue_sel) < 0) { + PMD_DRV_LOG(INFO, "setup tx vq fails: %u", i); + return -1; + } + } + + return 0; +} + +int +virtio_user_start_device(struct virtio_user_dev *dev) +{ + uint64_t features; + int ret; + + /* Step 0: tell vhost to create queues */ + if (virtio_user_queue_setup(dev, virtio_user_create_queue) < 0) + goto error; + + /* Step 1: set features */ + features = dev->features; + /* Strip VIRTIO_NET_F_MAC, as MAC address is handled in vdev init */ + features &= ~(1ull << VIRTIO_NET_F_MAC); + /* Strip VIRTIO_NET_F_CTRL_VQ, as devices do not really need to know */ + features &= ~(1ull << VIRTIO_NET_F_CTRL_VQ); + features &= ~(1ull << VIRTIO_NET_F_STATUS); + ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features); + if (ret < 0) + goto error; + PMD_DRV_LOG(INFO, "set features: %" PRIx64, features); + + /* Step 2: share memory regions */ + ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL); + if (ret < 0) + goto error; + + /* Step 3: kick queues */ + if (virtio_user_queue_setup(dev, virtio_user_kick_queue) < 0) + goto error; + + /* Step 4: enable queues + * we enable the 1st queue pair by default. + */ + dev->ops->enable_qp(dev, 0, 1); + + return 0; +error: + /* TODO: free resource here or caller to check */ + return -1; +} + +int virtio_user_stop_device(struct virtio_user_dev *dev) +{ + uint32_t i; + + for (i = 0; i < dev->max_queue_pairs; ++i) + dev->ops->enable_qp(dev, i, 0); + + return 0; +} + +static inline void +parse_mac(struct virtio_user_dev *dev, const char *mac) +{ + int i, r; + uint32_t tmp[ETHER_ADDR_LEN]; + + if (!mac) + return; + + r = sscanf(mac, "%x:%x:%x:%x:%x:%x", &tmp[0], + &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5]); + if (r == ETHER_ADDR_LEN) { + for (i = 0; i < ETHER_ADDR_LEN; ++i) + dev->mac_addr[i] = (uint8_t)tmp[i]; + dev->mac_specified = 1; + } else { + /* ignore the wrong mac, use random mac */ + PMD_DRV_LOG(ERR, "wrong format of mac: %s", mac); + } +} + +int +is_vhost_user_by_type(const char *path) +{ + struct stat sb; + + if (stat(path, &sb) == -1) + return 0; + + return S_ISSOCK(sb.st_mode); +} + +static int +virtio_user_dev_init_notify(struct virtio_user_dev *dev) +{ + uint32_t i, j; + int callfd; + int kickfd; + + for (i = 0; i < VIRTIO_MAX_VIRTQUEUES; ++i) { + if (i >= dev->max_queue_pairs * 2) { + dev->kickfds[i] = -1; + dev->callfds[i] = -1; + continue; + } + + /* May use invalid flag, but some backend uses kickfd and + * callfd as criteria to judge if dev is alive. so finally we + * use real event_fd. + */ + callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (callfd < 0) { + PMD_DRV_LOG(ERR, "callfd error, %s", strerror(errno)); + break; + } + kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (kickfd < 0) { + PMD_DRV_LOG(ERR, "kickfd error, %s", strerror(errno)); + break; + } + dev->callfds[i] = callfd; + dev->kickfds[i] = kickfd; + } + + if (i < VIRTIO_MAX_VIRTQUEUES) { + for (j = 0; j <= i; ++j) { + close(dev->callfds[j]); + close(dev->kickfds[j]); + } + + return -1; + } + + return 0; +} + +static int +virtio_user_fill_intr_handle(struct virtio_user_dev *dev) +{ + uint32_t i; + struct rte_eth_dev *eth_dev = &rte_eth_devices[dev->port_id]; + + if (!eth_dev->intr_handle) { + eth_dev->intr_handle = malloc(sizeof(*eth_dev->intr_handle)); + if (!eth_dev->intr_handle) { + PMD_DRV_LOG(ERR, "fail to allocate intr_handle"); + return -1; + } + memset(eth_dev->intr_handle, 0, sizeof(*eth_dev->intr_handle)); + } + + for (i = 0; i < dev->max_queue_pairs; ++i) + eth_dev->intr_handle->efds[i] = dev->callfds[i]; + eth_dev->intr_handle->nb_efd = dev->max_queue_pairs; + eth_dev->intr_handle->max_intr = dev->max_queue_pairs + 1; + eth_dev->intr_handle->type = RTE_INTR_HANDLE_VDEV; + if (dev->vhostfd >= 0) + eth_dev->intr_handle->fd = dev->vhostfd; + + return 0; +} + +static int +virtio_user_dev_setup(struct virtio_user_dev *dev) +{ + uint32_t q; + + dev->vhostfd = -1; + dev->vhostfds = NULL; + dev->tapfds = NULL; + + if (is_vhost_user_by_type(dev->path)) { + dev->ops = &ops_user; + } else { + dev->ops = &ops_kernel; + + dev->vhostfds = malloc(dev->max_queue_pairs * sizeof(int)); + dev->tapfds = malloc(dev->max_queue_pairs * sizeof(int)); + if (!dev->vhostfds || !dev->tapfds) { + PMD_INIT_LOG(ERR, "Failed to malloc"); + return -1; + } + + for (q = 0; q < dev->max_queue_pairs; ++q) { + dev->vhostfds[q] = -1; + dev->tapfds[q] = -1; + } + } + + if (dev->ops->setup(dev) < 0) + return -1; + + if (virtio_user_dev_init_notify(dev) < 0) + return -1; + + if (virtio_user_fill_intr_handle(dev) < 0) + return -1; + + return 0; +} + +/* Use below macro to filter features from vhost backend */ +#define VIRTIO_USER_SUPPORTED_FEATURES \ + (1ULL << VIRTIO_NET_F_MAC | \ + 1ULL << VIRTIO_NET_F_STATUS | \ + 1ULL << VIRTIO_NET_F_MQ | \ + 1ULL << VIRTIO_NET_F_CTRL_MAC_ADDR | \ + 1ULL << VIRTIO_NET_F_CTRL_VQ | \ + 1ULL << VIRTIO_NET_F_CTRL_RX | \ + 1ULL << VIRTIO_NET_F_CTRL_VLAN | \ + 1ULL << VIRTIO_NET_F_CSUM | \ + 1ULL << VIRTIO_NET_F_HOST_TSO4 | \ + 1ULL << VIRTIO_NET_F_HOST_TSO6 | \ + 1ULL << VIRTIO_NET_F_MRG_RXBUF | \ + 1ULL << VIRTIO_RING_F_INDIRECT_DESC | \ + 1ULL << VIRTIO_NET_F_GUEST_CSUM | \ + 1ULL << VIRTIO_NET_F_GUEST_TSO4 | \ + 1ULL << VIRTIO_NET_F_GUEST_TSO6 | \ + 1ULL << VIRTIO_F_VERSION_1) + +int +virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, + int cq, int queue_size, const char *mac, char **ifname) +{ + snprintf(dev->path, PATH_MAX, "%s", path); + dev->max_queue_pairs = queues; + dev->queue_pairs = 1; /* mq disabled by default */ + dev->queue_size = queue_size; + dev->mac_specified = 0; + parse_mac(dev, mac); + + if (*ifname) { + dev->ifname = *ifname; + *ifname = NULL; + } + + if (virtio_user_dev_setup(dev) < 0) { + PMD_INIT_LOG(ERR, "backend set up fails"); + return -1; + } + if (dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL) < 0) { + PMD_INIT_LOG(ERR, "set_owner fails: %s", strerror(errno)); + return -1; + } + + if (dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, + &dev->device_features) < 0) { + PMD_INIT_LOG(ERR, "get_features failed: %s", strerror(errno)); + return -1; + } + if (dev->mac_specified) + dev->device_features |= (1ull << VIRTIO_NET_F_MAC); + + if (cq) { + /* device does not really need to know anything about CQ, + * so if necessary, we just claim to support CQ + */ + dev->device_features |= (1ull << VIRTIO_NET_F_CTRL_VQ); + } else { + dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_VQ); + /* Also disable features depends on VIRTIO_NET_F_CTRL_VQ */ + dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_RX); + dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_VLAN); + dev->device_features &= ~(1ull << VIRTIO_NET_F_GUEST_ANNOUNCE); + dev->device_features &= ~(1ull << VIRTIO_NET_F_MQ); + dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_MAC_ADDR); + } + + /* The backend will not report this feature, we add it explicitly */ + if (is_vhost_user_by_type(dev->path)) + dev->device_features |= (1ull << VIRTIO_NET_F_STATUS); + + dev->device_features &= VIRTIO_USER_SUPPORTED_FEATURES; + + return 0; +} + +void +virtio_user_dev_uninit(struct virtio_user_dev *dev) +{ + uint32_t i; + + virtio_user_stop_device(dev); + + for (i = 0; i < dev->max_queue_pairs * 2; ++i) { + close(dev->callfds[i]); + close(dev->kickfds[i]); + } + + close(dev->vhostfd); + + if (dev->vhostfds) { + for (i = 0; i < dev->max_queue_pairs; ++i) + close(dev->vhostfds[i]); + free(dev->vhostfds); + free(dev->tapfds); + } + + free(dev->ifname); +} + +static uint8_t +virtio_user_handle_mq(struct virtio_user_dev *dev, uint16_t q_pairs) +{ + uint16_t i; + uint8_t ret = 0; + + if (q_pairs > dev->max_queue_pairs) { + PMD_INIT_LOG(ERR, "multi-q config %u, but only %u supported", + q_pairs, dev->max_queue_pairs); + return -1; + } + + for (i = 0; i < q_pairs; ++i) + ret |= dev->ops->enable_qp(dev, i, 1); + for (i = q_pairs; i < dev->max_queue_pairs; ++i) + ret |= dev->ops->enable_qp(dev, i, 0); + + dev->queue_pairs = q_pairs; + + return ret; +} + +static uint32_t +virtio_user_handle_ctrl_msg(struct virtio_user_dev *dev, struct vring *vring, + uint16_t idx_hdr) +{ + struct virtio_net_ctrl_hdr *hdr; + virtio_net_ctrl_ack status = ~0; + uint16_t i, idx_data, idx_status; + uint32_t n_descs = 0; + + /* locate desc for header, data, and status */ + idx_data = vring->desc[idx_hdr].next; + n_descs++; + + i = idx_data; + while (vring->desc[i].flags == VRING_DESC_F_NEXT) { + i = vring->desc[i].next; + n_descs++; + } + + /* locate desc for status */ + idx_status = i; + n_descs++; + + hdr = (void *)(uintptr_t)vring->desc[idx_hdr].addr; + if (hdr->class == VIRTIO_NET_CTRL_MQ && + hdr->cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) { + uint16_t queues; + + queues = *(uint16_t *)(uintptr_t)vring->desc[idx_data].addr; + status = virtio_user_handle_mq(dev, queues); + } + + /* Update status */ + *(virtio_net_ctrl_ack *)(uintptr_t)vring->desc[idx_status].addr = status; + + return n_descs; +} + +void +virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx) +{ + uint16_t avail_idx, desc_idx; + struct vring_used_elem *uep; + uint32_t n_descs; + struct vring *vring = &dev->vrings[queue_idx]; + + /* Consume avail ring, using used ring idx as first one */ + while (vring->used->idx != vring->avail->idx) { + avail_idx = (vring->used->idx) & (vring->num - 1); + desc_idx = vring->avail->ring[avail_idx]; + + n_descs = virtio_user_handle_ctrl_msg(dev, vring, desc_idx); + + /* Update used ring */ + uep = &vring->used->ring[avail_idx]; + uep->id = avail_idx; + uep->len = n_descs; + + vring->used->idx++; + } +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.h b/src/seastar/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.h new file mode 100644 index 00000000..8361b6bd --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.h @@ -0,0 +1,77 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTIO_USER_DEV_H +#define _VIRTIO_USER_DEV_H + +#include <limits.h> +#include "../virtio_pci.h" +#include "../virtio_ring.h" +#include "vhost.h" + +struct virtio_user_dev { + /* for vhost_user backend */ + int vhostfd; + + /* for vhost_kernel backend */ + char *ifname; + int *vhostfds; + int *tapfds; + + /* for both vhost_user and vhost_kernel */ + int callfds[VIRTIO_MAX_VIRTQUEUES]; + int kickfds[VIRTIO_MAX_VIRTQUEUES]; + int mac_specified; + uint32_t max_queue_pairs; + uint32_t queue_pairs; + uint32_t queue_size; + uint64_t features; /* the negotiated features with driver, + * and will be sync with device + */ + uint64_t device_features; /* supported features by device */ + uint8_t status; + uint8_t port_id; + uint8_t mac_addr[ETHER_ADDR_LEN]; + char path[PATH_MAX]; + struct vring vrings[VIRTIO_MAX_VIRTQUEUES]; + struct virtio_user_backend_ops *ops; +}; + +int is_vhost_user_by_type(const char *path); +int virtio_user_start_device(struct virtio_user_dev *dev); +int virtio_user_stop_device(struct virtio_user_dev *dev); +int virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, + int cq, int queue_size, const char *mac, char **ifname); +void virtio_user_dev_uninit(struct virtio_user_dev *dev); +void virtio_user_handle_cq(struct virtio_user_dev *dev, uint16_t queue_idx); +#endif diff --git a/src/seastar/dpdk/drivers/net/virtio/virtio_user_ethdev.c b/src/seastar/dpdk/drivers/net/virtio/virtio_user_ethdev.c new file mode 100644 index 00000000..280406c0 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtio_user_ethdev.c @@ -0,0 +1,578 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <sys/types.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/socket.h> + +#include <rte_malloc.h> +#include <rte_kvargs.h> +#include <rte_ethdev_vdev.h> +#include <rte_vdev.h> +#include <rte_alarm.h> + +#include "virtio_ethdev.h" +#include "virtio_logs.h" +#include "virtio_pci.h" +#include "virtqueue.h" +#include "virtio_rxtx.h" +#include "virtio_user/virtio_user_dev.h" + +#define virtio_user_get_dev(hw) \ + ((struct virtio_user_dev *)(hw)->virtio_user_dev) + +static void +virtio_user_delayed_handler(void *param) +{ + struct virtio_hw *hw = (struct virtio_hw *)param; + struct rte_eth_dev *dev = &rte_eth_devices[hw->port_id]; + + rte_intr_callback_unregister(dev->intr_handle, + virtio_interrupt_handler, + dev); +} + +static void +virtio_user_read_dev_config(struct virtio_hw *hw, size_t offset, + void *dst, int length) +{ + int i; + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + if (offset == offsetof(struct virtio_net_config, mac) && + length == ETHER_ADDR_LEN) { + for (i = 0; i < ETHER_ADDR_LEN; ++i) + ((uint8_t *)dst)[i] = dev->mac_addr[i]; + return; + } + + if (offset == offsetof(struct virtio_net_config, status)) { + char buf[128]; + + if (dev->vhostfd >= 0) { + int r; + int flags; + + flags = fcntl(dev->vhostfd, F_GETFL); + fcntl(dev->vhostfd, F_SETFL, flags | O_NONBLOCK); + r = recv(dev->vhostfd, buf, 128, MSG_PEEK); + if (r == 0 || (r < 0 && errno != EAGAIN)) { + dev->status &= (~VIRTIO_NET_S_LINK_UP); + PMD_DRV_LOG(ERR, "virtio-user port %u is down", + hw->port_id); + /* Only client mode is available now. Once the + * connection is broken, it can never be up + * again. Besides, this function could be called + * in the process of interrupt handling, + * callback cannot be unregistered here, set an + * alarm to do it. + */ + rte_eal_alarm_set(1, + virtio_user_delayed_handler, + (void *)hw); + } else { + dev->status |= VIRTIO_NET_S_LINK_UP; + } + fcntl(dev->vhostfd, F_SETFL, flags & (~O_NONBLOCK)); + } + *(uint16_t *)dst = dev->status; + } + + if (offset == offsetof(struct virtio_net_config, max_virtqueue_pairs)) + *(uint16_t *)dst = dev->max_queue_pairs; +} + +static void +virtio_user_write_dev_config(struct virtio_hw *hw, size_t offset, + const void *src, int length) +{ + int i; + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + if ((offset == offsetof(struct virtio_net_config, mac)) && + (length == ETHER_ADDR_LEN)) + for (i = 0; i < ETHER_ADDR_LEN; ++i) + dev->mac_addr[i] = ((const uint8_t *)src)[i]; + else + PMD_DRV_LOG(ERR, "not supported offset=%zu, len=%d", + offset, length); +} + +static void +virtio_user_reset(struct virtio_hw *hw) +{ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + if (dev->status & VIRTIO_CONFIG_STATUS_DRIVER_OK) + virtio_user_stop_device(dev); +} + +static void +virtio_user_set_status(struct virtio_hw *hw, uint8_t status) +{ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + if (status & VIRTIO_CONFIG_STATUS_DRIVER_OK) + virtio_user_start_device(dev); + else if (status == VIRTIO_CONFIG_STATUS_RESET) + virtio_user_reset(hw); + dev->status = status; +} + +static uint8_t +virtio_user_get_status(struct virtio_hw *hw) +{ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + return dev->status; +} + +static uint64_t +virtio_user_get_features(struct virtio_hw *hw) +{ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + /* unmask feature bits defined in vhost user protocol */ + return dev->device_features & VIRTIO_PMD_SUPPORTED_GUEST_FEATURES; +} + +static void +virtio_user_set_features(struct virtio_hw *hw, uint64_t features) +{ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + dev->features = features & dev->device_features; +} + +static uint8_t +virtio_user_get_isr(struct virtio_hw *hw __rte_unused) +{ + /* rxq interrupts and config interrupt are separated in virtio-user, + * here we only report config change. + */ + return VIRTIO_PCI_ISR_CONFIG; +} + +static uint16_t +virtio_user_set_config_irq(struct virtio_hw *hw __rte_unused, + uint16_t vec __rte_unused) +{ + return 0; +} + +static uint16_t +virtio_user_set_queue_irq(struct virtio_hw *hw __rte_unused, + struct virtqueue *vq __rte_unused, + uint16_t vec) +{ + /* pretend we have done that */ + return vec; +} + +/* This function is to get the queue size, aka, number of descs, of a specified + * queue. Different with the VHOST_USER_GET_QUEUE_NUM, which is used to get the + * max supported queues. + */ +static uint16_t +virtio_user_get_queue_num(struct virtio_hw *hw, uint16_t queue_id __rte_unused) +{ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + /* Currently, each queue has same queue size */ + return dev->queue_size; +} + +static int +virtio_user_setup_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + uint16_t queue_idx = vq->vq_queue_index; + uint64_t desc_addr, avail_addr, used_addr; + + desc_addr = (uintptr_t)vq->vq_ring_virt_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail, + ring[vq->vq_nentries]), + VIRTIO_PCI_VRING_ALIGN); + + dev->vrings[queue_idx].num = vq->vq_nentries; + dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr; + dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr; + dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr; + + return 0; +} + +static void +virtio_user_del_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + /* For legacy devices, write 0 to VIRTIO_PCI_QUEUE_PFN port, QEMU + * correspondingly stops the ioeventfds, and reset the status of + * the device. + * For modern devices, set queue desc, avail, used in PCI bar to 0, + * not see any more behavior in QEMU. + * + * Here we just care about what information to deliver to vhost-user + * or vhost-kernel. So we just close ioeventfd for now. + */ + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + close(dev->callfds[vq->vq_queue_index]); + close(dev->kickfds[vq->vq_queue_index]); +} + +static void +virtio_user_notify_queue(struct virtio_hw *hw, struct virtqueue *vq) +{ + uint64_t buf = 1; + struct virtio_user_dev *dev = virtio_user_get_dev(hw); + + if (hw->cvq && (hw->cvq->vq == vq)) { + virtio_user_handle_cq(dev, vq->vq_queue_index); + return; + } + + if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) + PMD_DRV_LOG(ERR, "failed to kick backend: %s", + strerror(errno)); +} + +const struct virtio_pci_ops virtio_user_ops = { + .read_dev_cfg = virtio_user_read_dev_config, + .write_dev_cfg = virtio_user_write_dev_config, + .reset = virtio_user_reset, + .get_status = virtio_user_get_status, + .set_status = virtio_user_set_status, + .get_features = virtio_user_get_features, + .set_features = virtio_user_set_features, + .get_isr = virtio_user_get_isr, + .set_config_irq = virtio_user_set_config_irq, + .set_queue_irq = virtio_user_set_queue_irq, + .get_queue_num = virtio_user_get_queue_num, + .setup_queue = virtio_user_setup_queue, + .del_queue = virtio_user_del_queue, + .notify_queue = virtio_user_notify_queue, +}; + +static const char *valid_args[] = { +#define VIRTIO_USER_ARG_QUEUES_NUM "queues" + VIRTIO_USER_ARG_QUEUES_NUM, +#define VIRTIO_USER_ARG_CQ_NUM "cq" + VIRTIO_USER_ARG_CQ_NUM, +#define VIRTIO_USER_ARG_MAC "mac" + VIRTIO_USER_ARG_MAC, +#define VIRTIO_USER_ARG_PATH "path" + VIRTIO_USER_ARG_PATH, +#define VIRTIO_USER_ARG_QUEUE_SIZE "queue_size" + VIRTIO_USER_ARG_QUEUE_SIZE, +#define VIRTIO_USER_ARG_INTERFACE_NAME "iface" + VIRTIO_USER_ARG_INTERFACE_NAME, + NULL +}; + +#define VIRTIO_USER_DEF_CQ_EN 0 +#define VIRTIO_USER_DEF_Q_NUM 1 +#define VIRTIO_USER_DEF_Q_SZ 256 + +static int +get_string_arg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + if (!value || !extra_args) + return -EINVAL; + + *(char **)extra_args = strdup(value); + + if (!*(char **)extra_args) + return -ENOMEM; + + return 0; +} + +static int +get_integer_arg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + if (!value || !extra_args) + return -EINVAL; + + *(uint64_t *)extra_args = strtoull(value, NULL, 0); + + return 0; +} + +static struct rte_vdev_driver virtio_user_driver; + +static struct rte_eth_dev * +virtio_user_eth_dev_alloc(struct rte_vdev_device *vdev) +{ + struct rte_eth_dev *eth_dev; + struct rte_eth_dev_data *data; + struct virtio_hw *hw; + struct virtio_user_dev *dev; + + eth_dev = rte_eth_vdev_allocate(vdev, sizeof(*hw)); + if (!eth_dev) { + PMD_INIT_LOG(ERR, "cannot alloc rte_eth_dev"); + return NULL; + } + + data = eth_dev->data; + hw = eth_dev->data->dev_private; + + dev = rte_zmalloc(NULL, sizeof(*dev), 0); + if (!dev) { + PMD_INIT_LOG(ERR, "malloc virtio_user_dev failed"); + rte_eth_dev_release_port(eth_dev); + rte_free(hw); + return NULL; + } + + hw->port_id = data->port_id; + dev->port_id = data->port_id; + virtio_hw_internal[hw->port_id].vtpci_ops = &virtio_user_ops; + /* + * MSIX is required to enable LSC (see virtio_init_device). + * Here just pretend that we support msix. + */ + hw->use_msix = 1; + hw->modern = 0; + hw->use_simple_rxtx = 0; + hw->virtio_user_dev = dev; + data->dev_flags = RTE_ETH_DEV_DETACHABLE; + return eth_dev; +} + +static void +virtio_user_eth_dev_free(struct rte_eth_dev *eth_dev) +{ + struct rte_eth_dev_data *data = eth_dev->data; + struct virtio_hw *hw = data->dev_private; + + rte_free(hw->virtio_user_dev); + rte_free(hw); + rte_eth_dev_release_port(eth_dev); +} + +/* Dev initialization routine. Invoked once for each virtio vdev at + * EAL init time, see rte_eal_dev_init(). + * Returns 0 on success. + */ +static int +virtio_user_pmd_probe(struct rte_vdev_device *dev) +{ + struct rte_kvargs *kvlist = NULL; + struct rte_eth_dev *eth_dev; + struct virtio_hw *hw; + uint64_t queues = VIRTIO_USER_DEF_Q_NUM; + uint64_t cq = VIRTIO_USER_DEF_CQ_EN; + uint64_t queue_size = VIRTIO_USER_DEF_Q_SZ; + char *path = NULL; + char *ifname = NULL; + char *mac_addr = NULL; + int ret = -1; + + kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_args); + if (!kvlist) { + PMD_INIT_LOG(ERR, "error when parsing param"); + goto end; + } + + if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_PATH) == 1) { + if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_PATH, + &get_string_arg, &path) < 0) { + PMD_INIT_LOG(ERR, "error to parse %s", + VIRTIO_USER_ARG_PATH); + goto end; + } + } else { + PMD_INIT_LOG(ERR, "arg %s is mandatory for virtio_user", + VIRTIO_USER_ARG_QUEUE_SIZE); + goto end; + } + + if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_INTERFACE_NAME) == 1) { + if (is_vhost_user_by_type(path)) { + PMD_INIT_LOG(ERR, + "arg %s applies only to vhost-kernel backend", + VIRTIO_USER_ARG_INTERFACE_NAME); + goto end; + } + + if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_INTERFACE_NAME, + &get_string_arg, &ifname) < 0) { + PMD_INIT_LOG(ERR, "error to parse %s", + VIRTIO_USER_ARG_INTERFACE_NAME); + goto end; + } + } + + if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_MAC) == 1) { + if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_MAC, + &get_string_arg, &mac_addr) < 0) { + PMD_INIT_LOG(ERR, "error to parse %s", + VIRTIO_USER_ARG_MAC); + goto end; + } + } + + if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_QUEUE_SIZE) == 1) { + if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_QUEUE_SIZE, + &get_integer_arg, &queue_size) < 0) { + PMD_INIT_LOG(ERR, "error to parse %s", + VIRTIO_USER_ARG_QUEUE_SIZE); + goto end; + } + } + + if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_QUEUES_NUM) == 1) { + if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_QUEUES_NUM, + &get_integer_arg, &queues) < 0) { + PMD_INIT_LOG(ERR, "error to parse %s", + VIRTIO_USER_ARG_QUEUES_NUM); + goto end; + } + } + + if (rte_kvargs_count(kvlist, VIRTIO_USER_ARG_CQ_NUM) == 1) { + if (rte_kvargs_process(kvlist, VIRTIO_USER_ARG_CQ_NUM, + &get_integer_arg, &cq) < 0) { + PMD_INIT_LOG(ERR, "error to parse %s", + VIRTIO_USER_ARG_CQ_NUM); + goto end; + } + } else if (queues > 1) { + cq = 1; + } + + if (queues > 1 && cq == 0) { + PMD_INIT_LOG(ERR, "multi-q requires ctrl-q"); + goto end; + } + + if (queues > VIRTIO_MAX_VIRTQUEUE_PAIRS) { + PMD_INIT_LOG(ERR, "arg %s %" PRIu64 " exceeds the limit %u", + VIRTIO_USER_ARG_QUEUES_NUM, queues, + VIRTIO_MAX_VIRTQUEUE_PAIRS); + goto end; + } + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + eth_dev = virtio_user_eth_dev_alloc(dev); + if (!eth_dev) { + PMD_INIT_LOG(ERR, "virtio_user fails to alloc device"); + goto end; + } + + hw = eth_dev->data->dev_private; + if (virtio_user_dev_init(hw->virtio_user_dev, path, queues, cq, + queue_size, mac_addr, &ifname) < 0) { + PMD_INIT_LOG(ERR, "virtio_user_dev_init fails"); + virtio_user_eth_dev_free(eth_dev); + goto end; + } + } else { + eth_dev = rte_eth_dev_attach_secondary(rte_vdev_device_name(dev)); + if (!eth_dev) + goto end; + } + + /* previously called by rte_pci_probe() for physical dev */ + if (eth_virtio_dev_init(eth_dev) < 0) { + PMD_INIT_LOG(ERR, "eth_virtio_dev_init fails"); + virtio_user_eth_dev_free(eth_dev); + goto end; + } + ret = 0; + +end: + if (kvlist) + rte_kvargs_free(kvlist); + if (path) + free(path); + if (mac_addr) + free(mac_addr); + if (ifname) + free(ifname); + return ret; +} + +/** Called by rte_eth_dev_detach() */ +static int +virtio_user_pmd_remove(struct rte_vdev_device *vdev) +{ + const char *name; + struct rte_eth_dev *eth_dev; + struct virtio_hw *hw; + struct virtio_user_dev *dev; + + if (!vdev) + return -EINVAL; + + name = rte_vdev_device_name(vdev); + PMD_DRV_LOG(INFO, "Un-Initializing %s", name); + eth_dev = rte_eth_dev_allocated(name); + if (!eth_dev) + return -ENODEV; + + /* make sure the device is stopped, queues freed */ + rte_eth_dev_close(eth_dev->data->port_id); + + hw = eth_dev->data->dev_private; + dev = hw->virtio_user_dev; + virtio_user_dev_uninit(dev); + + rte_free(eth_dev->data->dev_private); + rte_free(eth_dev->data); + rte_eth_dev_release_port(eth_dev); + + return 0; +} + +static struct rte_vdev_driver virtio_user_driver = { + .probe = virtio_user_pmd_probe, + .remove = virtio_user_pmd_remove, +}; + +RTE_PMD_REGISTER_VDEV(net_virtio_user, virtio_user_driver); +RTE_PMD_REGISTER_ALIAS(net_virtio_user, virtio_user); +RTE_PMD_REGISTER_PARAM_STRING(net_virtio_user, + "path=<path> " + "mac=<mac addr> " + "cq=<int> " + "queue_size=<int> " + "queues=<int> " + "iface=<string>"); diff --git a/src/seastar/dpdk/drivers/net/virtio/virtqueue.c b/src/seastar/dpdk/drivers/net/virtio/virtqueue.c new file mode 100644 index 00000000..9ad77b8a --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtqueue.c @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdint.h> + +#include <rte_mbuf.h> + +#include "virtqueue.h" +#include "virtio_logs.h" +#include "virtio_pci.h" + +/* + * Two types of mbuf to be cleaned: + * 1) mbuf that has been consumed by backend but not used by virtio. + * 2) mbuf that hasn't been consued by backend. + */ +struct rte_mbuf * +virtqueue_detatch_unused(struct virtqueue *vq) +{ + struct rte_mbuf *cookie; + int idx; + + if (vq != NULL) + for (idx = 0; idx < vq->vq_nentries; idx++) { + cookie = vq->vq_descx[idx].cookie; + if (cookie != NULL) { + vq->vq_descx[idx].cookie = NULL; + return cookie; + } + } + return NULL; +} diff --git a/src/seastar/dpdk/drivers/net/virtio/virtqueue.h b/src/seastar/dpdk/drivers/net/virtio/virtqueue.h new file mode 100644 index 00000000..2e120861 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/virtio/virtqueue.h @@ -0,0 +1,374 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VIRTQUEUE_H_ +#define _VIRTQUEUE_H_ + +#include <stdint.h> + +#include <rte_atomic.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_mempool.h> + +#include "virtio_pci.h" +#include "virtio_ring.h" +#include "virtio_logs.h" +#include "virtio_rxtx.h" + +struct rte_mbuf; + +/* + * Per virtio_config.h in Linux. + * For virtio_pci on SMP, we don't need to order with respect to MMIO + * accesses through relaxed memory I/O windows, so smp_mb() et al are + * sufficient. + * + */ +#define virtio_mb() rte_smp_mb() +#define virtio_rmb() rte_smp_rmb() +#define virtio_wmb() rte_smp_wmb() + +#ifdef RTE_PMD_PACKET_PREFETCH +#define rte_packet_prefetch(p) rte_prefetch1(p) +#else +#define rte_packet_prefetch(p) do {} while(0) +#endif + +#define VIRTQUEUE_MAX_NAME_SZ 32 + +#ifdef RTE_VIRTIO_USER +/** + * Return the physical address (or virtual address in case of + * virtio-user) of mbuf data buffer. + * + * The address is firstly casted to the word size (sizeof(uintptr_t)) + * before casting it to uint64_t. This is to make it work with different + * combination of word size (64 bit and 32 bit) and virtio device + * (virtio-pci and virtio-user). + */ +#define VIRTIO_MBUF_ADDR(mb, vq) \ + ((uint64_t)(*(uintptr_t *)((uintptr_t)(mb) + (vq)->offset))) +#else +#define VIRTIO_MBUF_ADDR(mb, vq) ((mb)->buf_physaddr) +#endif + +/** + * Return the physical address (or virtual address in case of + * virtio-user) of mbuf data buffer, taking care of mbuf data offset + */ +#define VIRTIO_MBUF_DATA_DMA_ADDR(mb, vq) \ + (VIRTIO_MBUF_ADDR(mb, vq) + (mb)->data_off) + +#define VTNET_SQ_RQ_QUEUE_IDX 0 +#define VTNET_SQ_TQ_QUEUE_IDX 1 +#define VTNET_SQ_CQ_QUEUE_IDX 2 + +enum { VTNET_RQ = 0, VTNET_TQ = 1, VTNET_CQ = 2 }; +/** + * The maximum virtqueue size is 2^15. Use that value as the end of + * descriptor chain terminator since it will never be a valid index + * in the descriptor table. This is used to verify we are correctly + * handling vq_free_cnt. + */ +#define VQ_RING_DESC_CHAIN_END 32768 + +/** + * Control the RX mode, ie. promiscuous, allmulti, etc... + * All commands require an "out" sg entry containing a 1 byte + * state value, zero = disable, non-zero = enable. Commands + * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature. + * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA. + */ +#define VIRTIO_NET_CTRL_RX 0 +#define VIRTIO_NET_CTRL_RX_PROMISC 0 +#define VIRTIO_NET_CTRL_RX_ALLMULTI 1 +#define VIRTIO_NET_CTRL_RX_ALLUNI 2 +#define VIRTIO_NET_CTRL_RX_NOMULTI 3 +#define VIRTIO_NET_CTRL_RX_NOUNI 4 +#define VIRTIO_NET_CTRL_RX_NOBCAST 5 + +/** + * Control the MAC + * + * The MAC filter table is managed by the hypervisor, the guest should + * assume the size is infinite. Filtering should be considered + * non-perfect, ie. based on hypervisor resources, the guest may + * received packets from sources not specified in the filter list. + * + * In addition to the class/cmd header, the TABLE_SET command requires + * two out scatterlists. Each contains a 4 byte count of entries followed + * by a concatenated byte stream of the ETH_ALEN MAC addresses. The + * first sg list contains unicast addresses, the second is for multicast. + * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature + * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. + */ +struct virtio_net_ctrl_mac { + uint32_t entries; + uint8_t macs[][ETHER_ADDR_LEN]; +} __attribute__((__packed__)); + +#define VIRTIO_NET_CTRL_MAC 1 + #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 + +/** + * Control VLAN filtering + * + * The VLAN filter table is controlled via a simple ADD/DEL interface. + * VLAN IDs not added may be filtered by the hypervisor. Del is the + * opposite of add. Both commands expect an out entry containing a 2 + * byte VLAN ID. VLAN filtering is available with the + * VIRTIO_NET_F_CTRL_VLAN feature bit. + */ +#define VIRTIO_NET_CTRL_VLAN 2 +#define VIRTIO_NET_CTRL_VLAN_ADD 0 +#define VIRTIO_NET_CTRL_VLAN_DEL 1 + +struct virtio_net_ctrl_hdr { + uint8_t class; + uint8_t cmd; +} __attribute__((packed)); + +typedef uint8_t virtio_net_ctrl_ack; + +#define VIRTIO_NET_OK 0 +#define VIRTIO_NET_ERR 1 + +#define VIRTIO_MAX_CTRL_DATA 2048 + +struct virtio_pmd_ctrl { + struct virtio_net_ctrl_hdr hdr; + virtio_net_ctrl_ack status; + uint8_t data[VIRTIO_MAX_CTRL_DATA]; +}; + +struct vq_desc_extra { + void *cookie; + uint16_t ndescs; +}; + +struct virtqueue { + struct virtio_hw *hw; /**< virtio_hw structure pointer. */ + struct vring vq_ring; /**< vring keeping desc, used and avail */ + /** + * Last consumed descriptor in the used table, + * trails vq_ring.used->idx. + */ + uint16_t vq_used_cons_idx; + uint16_t vq_nentries; /**< vring desc numbers */ + uint16_t vq_free_cnt; /**< num of desc available */ + uint16_t vq_avail_idx; /**< sync until needed */ + uint16_t vq_free_thresh; /**< free threshold */ + + void *vq_ring_virt_mem; /**< linear address of vring*/ + unsigned int vq_ring_size; + + union { + struct virtnet_rx rxq; + struct virtnet_tx txq; + struct virtnet_ctl cq; + }; + + phys_addr_t vq_ring_mem; /**< physical address of vring, + * or virtual address for virtio_user. */ + + /** + * Head of the free chain in the descriptor table. If + * there are no free descriptors, this will be set to + * VQ_RING_DESC_CHAIN_END. + */ + uint16_t vq_desc_head_idx; + uint16_t vq_desc_tail_idx; + uint16_t vq_queue_index; /**< PCI queue index */ + uint16_t offset; /**< relative offset to obtain addr in mbuf */ + uint16_t *notify_addr; + struct rte_mbuf **sw_ring; /**< RX software ring. */ + struct vq_desc_extra vq_descx[0]; +}; + +/* If multiqueue is provided by host, then we suppport it. */ +#define VIRTIO_NET_CTRL_MQ 4 +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0 +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1 +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000 + +#define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 + +/** + * This is the first element of the scatter-gather list. If you don't + * specify GSO or CSUM features, you can simply ignore the header. + */ +struct virtio_net_hdr { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /**< Use csum_start,csum_offset*/ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /**< Checksum is valid */ + uint8_t flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /**< Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /**< GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /**< GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /**< GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /**< TCP has ECN set */ + uint8_t gso_type; + uint16_t hdr_len; /**< Ethernet + IP + tcp/udp hdrs */ + uint16_t gso_size; /**< Bytes to append to hdr_len per frame */ + uint16_t csum_start; /**< Position to start checksumming from */ + uint16_t csum_offset; /**< Offset after that to place checksum */ +}; + +/** + * This is the version of the header to use when the MRG_RXBUF + * feature has been negotiated. + */ +struct virtio_net_hdr_mrg_rxbuf { + struct virtio_net_hdr hdr; + uint16_t num_buffers; /**< Number of merged rx buffers */ +}; + +/* Region reserved to allow for transmit header and indirect ring */ +#define VIRTIO_MAX_TX_INDIRECT 8 +struct virtio_tx_region { + struct virtio_net_hdr_mrg_rxbuf tx_hdr; + struct vring_desc tx_indir[VIRTIO_MAX_TX_INDIRECT] + __attribute__((__aligned__(16))); +}; + +/* Chain all the descriptors in the ring with an END */ +static inline void +vring_desc_init(struct vring_desc *dp, uint16_t n) +{ + uint16_t i; + + for (i = 0; i < n - 1; i++) + dp[i].next = (uint16_t)(i + 1); + dp[i].next = VQ_RING_DESC_CHAIN_END; +} + +/** + * Tell the backend not to interrupt us. + */ +static inline void +virtqueue_disable_intr(struct virtqueue *vq) +{ + vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; +} + +/** + * Tell the backend to interrupt us. + */ +static inline void +virtqueue_enable_intr(struct virtqueue *vq) +{ + vq->vq_ring.avail->flags &= (~VRING_AVAIL_F_NO_INTERRUPT); +} + +/** + * Dump virtqueue internal structures, for debug purpose only. + */ +void virtqueue_dump(struct virtqueue *vq); +/** + * Get all mbufs to be freed. + */ +struct rte_mbuf *virtqueue_detatch_unused(struct virtqueue *vq); + +static inline int +virtqueue_full(const struct virtqueue *vq) +{ + return vq->vq_free_cnt == 0; +} + +#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx)) + +static inline void +vq_update_avail_idx(struct virtqueue *vq) +{ + virtio_wmb(); + vq->vq_ring.avail->idx = vq->vq_avail_idx; +} + +static inline void +vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx) +{ + uint16_t avail_idx; + /* + * Place the head of the descriptor chain into the next slot and make + * it usable to the host. The chain is made available now rather than + * deferring to virtqueue_notify() in the hopes that if the host is + * currently running on another CPU, we can keep it processing the new + * descriptor. + */ + avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1)); + if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx)) + vq->vq_ring.avail->ring[avail_idx] = desc_idx; + vq->vq_avail_idx++; +} + +static inline int +virtqueue_kick_prepare(struct virtqueue *vq) +{ + return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY); +} + +static inline void +virtqueue_notify(struct virtqueue *vq) +{ + /* + * Ensure updated avail->idx is visible to host. + * For virtio on IA, the notificaiton is through io port operation + * which is a serialization instruction itself. + */ + VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq); +} + +#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP +#define VIRTQUEUE_DUMP(vq) do { \ + uint16_t used_idx, nused; \ + used_idx = (vq)->vq_ring.used->idx; \ + nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \ + PMD_INIT_LOG(DEBUG, \ + "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \ + " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \ + " avail.flags=0x%x; used.flags=0x%x", \ + (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \ + (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \ + (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \ + (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \ +} while (0) +#else +#define VIRTQUEUE_DUMP(vq) do { } while (0) +#endif + +#endif /* _VIRTQUEUE_H_ */ |