Diffstat (limited to 'src/seastar/dpdk/drivers/net/tap')
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/Makefile                | 93
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/rte_eth_tap.c           | 1394
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/rte_eth_tap.h           | 100
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/rte_pmd_tap_version.map | 4
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/tap_flow.c              | 1507
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/tap_flow.h              | 82
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/tap_netlink.c           | 367
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/tap_netlink.h           | 69
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.c            | 323
-rw-r--r-- | src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.h            | 61
10 files changed, 4000 insertions, 0 deletions
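The diff adds the DPDK TAP poll-mode driver (librte_pmd_tap) together with its rte_flow, netlink and TC-message support files. The driver registers a virtual device named net_tap and parses the iface, speed and remote key/value arguments (see valid_arguments[] and RTE_PMD_REGISTER_PARAM_STRING in rte_eth_tap.c). As a rough usage sketch — testpmd and the interface names dtap0, dtap1 and eth1 are illustrative assumptions, not part of this change — the PMD could be attached at EAL start-up like this:

    # create one plain TAP port, and a second one mirrored to kernel netdevice eth1
    ./testpmd --vdev=net_tap0,iface=dtap0 \
              --vdev=net_tap1,iface=dtap1,remote=eth1,speed=10000 \
              -- -i

The remote argument redirects traffic between the tap netdevice and an existing kernel interface through the implicit TC flower rules that tun_alloc() installs (TAP_REMOTE_LOCAL_MAC, TAP_REMOTE_BROADCAST, TAP_REMOTE_TX, etc.).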
diff --git a/src/seastar/dpdk/drivers/net/tap/Makefile b/src/seastar/dpdk/drivers/net/tap/Makefile new file mode 100644 index 00000000..b0de0284 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/Makefile @@ -0,0 +1,93 @@ +# BSD LICENSE +# +# Copyright(c) 2016 Intel Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_pmd_tap.a + +EXPORT_MAP := rte_pmd_tap_version.map + +LIBABIVER := 1 + +CFLAGS += -O3 +CFLAGS += -I$(SRCDIR) +CFLAGS += -I. +CFLAGS += $(WERROR_FLAGS) + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c + +include $(RTE_SDK)/mk/rte.lib.mk + +# Generate and clean-up tap_autoconf.h. + +export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS +export AUTO_CONFIG_CFLAGS = -Wno-error + +ifndef V +AUTOCONF_OUTPUT := >/dev/null +endif + +tap_autoconf.h.new: FORCE + +tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh + $Q $(RM) -f -- '$@' + $Q sh -- '$<' '$@' \ + HAVE_TC_FLOWER \ + linux/pkt_cls.h \ + enum TCA_FLOWER_UNSPEC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_VLAN_ID \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_VLAN_PRIO \ + $(AUTOCONF_OUTPUT) + +# Create tap_autoconf.h or update it in case it differs from the new one. + +tap_autoconf.h: tap_autoconf.h.new + $Q [ -f '$@' ] && \ + cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \ + mv '$<' '$@' + +$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h + +clean_tap: FORCE + $Q rm -f -- tap_autoconf.h tap_autoconf.h.new + +clean: clean_tap diff --git a/src/seastar/dpdk/drivers/net/tap/rte_eth_tap.c b/src/seastar/dpdk/drivers/net/tap/rte_eth_tap.c new file mode 100644 index 00000000..e44de027 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/rte_eth_tap.c @@ -0,0 +1,1394 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_common.h> +#include <rte_mbuf.h> +#include <rte_ethdev.h> +#include <rte_ethdev_vdev.h> +#include <rte_malloc.h> +#include <rte_vdev.h> +#include <rte_kvargs.h> +#include <rte_net.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/utsname.h> +#include <sys/mman.h> +#include <errno.h> +#include <signal.h> +#include <stdint.h> +#include <sys/uio.h> +#include <unistd.h> +#include <arpa/inet.h> +#include <net/if.h> +#include <linux/if_tun.h> +#include <linux/if_ether.h> +#include <linux/version.h> +#include <fcntl.h> + +#include <rte_eth_tap.h> +#include <tap_flow.h> +#include <tap_netlink.h> +#include <tap_tcmsgs.h> + +/* Linux based path to the TUN device */ +#define TUN_TAP_DEV_PATH "/dev/net/tun" +#define DEFAULT_TAP_NAME "dtap" + +#define ETH_TAP_IFACE_ARG "iface" +#define ETH_TAP_SPEED_ARG "speed" +#define ETH_TAP_REMOTE_ARG "remote" + +#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0) +#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0) + +static struct rte_vdev_driver pmd_tap_drv; + +static const char *valid_arguments[] = { + ETH_TAP_IFACE_ARG, + ETH_TAP_SPEED_ARG, + ETH_TAP_REMOTE_ARG, + NULL +}; + +static int tap_unit; + +static volatile uint32_t tap_trigger; /* Rx trigger */ + +static struct rte_eth_link pmd_link = { + .link_speed = ETH_SPEED_NUM_10G, + .link_duplex = ETH_LINK_FULL_DUPLEX, + .link_status = ETH_LINK_DOWN, + .link_autoneg = ETH_LINK_SPEED_AUTONEG +}; + +static void +tap_trigger_cb(int sig __rte_unused) +{ + /* Valid trigger values are nonzero */ + tap_trigger = (tap_trigger + 1) | 0x80000000; +} + +/* Specifies on what netdevices the ioctl should be applied */ +enum ioctl_mode { + LOCAL_AND_REMOTE, + LOCAL_ONLY, + REMOTE_ONLY, +}; + +static int +tap_ioctl(struct pmd_internals *pmd, unsigned long request, + struct ifreq *ifr, int set, enum ioctl_mode mode); + +static int tap_intr_handle_set(struct rte_eth_dev *dev, int 
set); + +/* Tun/Tap allocation routine + * + * name is the number of the interface to use, unless NULL to take the host + * supplied name. + */ +static int +tun_alloc(struct pmd_internals *pmd, uint16_t qid) +{ + struct ifreq ifr; +#ifdef IFF_MULTI_QUEUE + unsigned int features; +#endif + int fd; + + memset(&ifr, 0, sizeof(struct ifreq)); + + /* + * Do not set IFF_NO_PI as packet information header will be needed + * to check if a received packet has been truncated. + */ + ifr.ifr_flags = IFF_TAP; + snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name); + + RTE_LOG(DEBUG, PMD, "ifr_name '%s'\n", ifr.ifr_name); + + fd = open(TUN_TAP_DEV_PATH, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, PMD, "Unable to create TAP interface"); + goto error; + } + +#ifdef IFF_MULTI_QUEUE + /* Grab the TUN features to verify we can work multi-queue */ + if (ioctl(fd, TUNGETFEATURES, &features) < 0) { + RTE_LOG(ERR, PMD, "TAP unable to get TUN/TAP features\n"); + goto error; + } + RTE_LOG(DEBUG, PMD, " TAP Features %08x\n", features); + + if (features & IFF_MULTI_QUEUE) { + RTE_LOG(DEBUG, PMD, " Multi-queue support for %d queues\n", + RTE_PMD_TAP_MAX_QUEUES); + ifr.ifr_flags |= IFF_MULTI_QUEUE; + } else +#endif + { + ifr.ifr_flags |= IFF_ONE_QUEUE; + RTE_LOG(DEBUG, PMD, " Single queue only support\n"); + } + + /* Set the TUN/TAP configuration and set the name if needed */ + if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) { + RTE_LOG(WARNING, PMD, + "Unable to set TUNSETIFF for %s\n", + ifr.ifr_name); + perror("TUNSETIFF"); + goto error; + } + + /* Always set the file descriptor to non-blocking */ + if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) { + RTE_LOG(WARNING, PMD, + "Unable to set %s to nonblocking\n", + ifr.ifr_name); + perror("F_SETFL, NONBLOCK"); + goto error; + } + + /* Set up trigger to optimize empty Rx bursts */ + errno = 0; + do { + struct sigaction sa; + int flags = fcntl(fd, F_GETFL); + + if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1) + break; + if (sa.sa_handler != tap_trigger_cb) { + /* + * Make sure SIGIO is not already taken. This is done + * as late as possible to leave the application a + * chance to set up its own signal handler first. + */ + if (sa.sa_handler != SIG_IGN && + sa.sa_handler != SIG_DFL) { + errno = EBUSY; + break; + } + sa = (struct sigaction){ + .sa_flags = SA_RESTART, + .sa_handler = tap_trigger_cb, + }; + if (sigaction(SIGIO, &sa, NULL) == -1) + break; + } + /* Enable SIGIO on file descriptor */ + fcntl(fd, F_SETFL, flags | O_ASYNC); + fcntl(fd, F_SETOWN, getpid()); + } while (0); + if (errno) { + /* Disable trigger globally in case of error */ + tap_trigger = 0; + RTE_LOG(WARNING, PMD, "Rx trigger disabled: %s\n", + strerror(errno)); + } + + if (qid == 0) { + struct ifreq ifr; + + /* + * pmd->eth_addr contains the desired MAC, either from remote + * or from a random assignment. Sync it with the tap netdevice. 
+ */ + ifr.ifr_hwaddr.sa_family = AF_LOCAL; + rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr, + ETHER_ADDR_LEN); + if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) + goto error; + + pmd->if_index = if_nametoindex(pmd->name); + if (!pmd->if_index) { + RTE_LOG(ERR, PMD, + "Could not find ifindex for %s: rte_flow won't be usable.\n", + pmd->name); + return fd; + } + if (!pmd->flower_support) + return fd; + if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) { + RTE_LOG(ERR, PMD, + "Could not create multiq qdisc for %s: rte_flow won't be usable.\n", + pmd->name); + return fd; + } + if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) { + RTE_LOG(ERR, PMD, + "Could not create multiq qdisc for %s: rte_flow won't be usable.\n", + pmd->name); + return fd; + } + if (pmd->remote_if_index) { + /* + * Flush usually returns negative value because it tries + * to delete every QDISC (and on a running device, one + * QDISC at least is needed). Ignore negative return + * value. + */ + qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index); + if (qdisc_create_ingress(pmd->nlsk_fd, + pmd->remote_if_index) < 0) + goto remote_fail; + LIST_INIT(&pmd->implicit_flows); + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_LOCAL_MAC) < 0) + goto remote_fail; + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_BROADCAST) < 0) + goto remote_fail; + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_BROADCASTV6) < 0) + goto remote_fail; + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_TX) < 0) + goto remote_fail; + } + } + + return fd; + +remote_fail: + RTE_LOG(ERR, PMD, + "Could not set up remote flow rules for %s: remote disabled.\n", + pmd->name); + pmd->remote_if_index = 0; + tap_flow_implicit_flush(pmd, NULL); + return fd; + +error: + if (fd > 0) + close(fd); + return -1; +} + +/* Callback to handle the rx burst of packets to the correct interface and + * file descriptor(s) in a multi-queue setup. + */ +static uint16_t +pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +{ + struct rx_queue *rxq = queue; + uint16_t num_rx; + unsigned long num_rx_bytes = 0; + uint32_t trigger = tap_trigger; + + if (trigger == rxq->trigger_seen) + return 0; + if (trigger) + rxq->trigger_seen = trigger; + rte_compiler_barrier(); + for (num_rx = 0; num_rx < nb_pkts; ) { + struct rte_mbuf *mbuf = rxq->pool; + struct rte_mbuf *seg = NULL; + struct rte_mbuf *new_tail = NULL; + uint16_t data_off = rte_pktmbuf_headroom(mbuf); + int len; + + len = readv(rxq->fd, *rxq->iovecs, + 1 + (rxq->rxmode->enable_scatter ? + rxq->nb_rx_desc : 1)); + if (len < (int)sizeof(struct tun_pi)) + break; + + /* Packet couldn't fit in the provided mbuf */ + if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) { + rxq->stats.ierrors++; + continue; + } + + len -= sizeof(struct tun_pi); + + mbuf->pkt_len = len; + mbuf->port = rxq->in_port; + while (1) { + struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(!buf)) { + rxq->stats.rx_nombuf++; + /* No new buf has been allocated: do nothing */ + if (!new_tail || !seg) + goto end; + + seg->next = NULL; + rte_pktmbuf_free(mbuf); + + goto end; + } + seg = seg ? 
seg->next : mbuf; + if (rxq->pool == mbuf) + rxq->pool = buf; + if (new_tail) + new_tail->next = buf; + new_tail = buf; + new_tail->next = seg->next; + + /* iovecs[0] is reserved for packet info (pi) */ + (*rxq->iovecs)[mbuf->nb_segs].iov_len = + buf->buf_len - data_off; + (*rxq->iovecs)[mbuf->nb_segs].iov_base = + (char *)buf->buf_addr + data_off; + + seg->data_len = RTE_MIN(seg->buf_len - data_off, len); + seg->data_off = data_off; + + len -= seg->data_len; + if (len <= 0) + break; + mbuf->nb_segs++; + /* First segment has headroom, not the others */ + data_off = 0; + } + seg->next = NULL; + mbuf->packet_type = rte_net_get_ptype(mbuf, NULL, + RTE_PTYPE_ALL_MASK); + + /* account for the receive frame */ + bufs[num_rx++] = mbuf; + num_rx_bytes += mbuf->pkt_len; + } +end: + rxq->stats.ipackets += num_rx; + rxq->stats.ibytes += num_rx_bytes; + + return num_rx; +} + +/* Callback to handle sending packets from the tap interface + */ +static uint16_t +pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +{ + struct tx_queue *txq = queue; + uint16_t num_tx = 0; + unsigned long num_tx_bytes = 0; + uint32_t max_size; + int i; + + if (unlikely(nb_pkts == 0)) + return 0; + + max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4); + for (i = 0; i < nb_pkts; i++) { + struct rte_mbuf *mbuf = bufs[num_tx]; + struct iovec iovecs[mbuf->nb_segs + 1]; + struct tun_pi pi = { .flags = 0 }; + struct rte_mbuf *seg = mbuf; + int n; + int j; + + /* stats.errs will be incremented */ + if (rte_pktmbuf_pkt_len(mbuf) > max_size) + break; + + iovecs[0].iov_base = π + iovecs[0].iov_len = sizeof(pi); + for (j = 1; j <= mbuf->nb_segs; j++) { + iovecs[j].iov_len = rte_pktmbuf_data_len(seg); + iovecs[j].iov_base = + rte_pktmbuf_mtod(seg, void *); + seg = seg->next; + } + /* copy the tx frame data */ + n = writev(txq->fd, iovecs, mbuf->nb_segs + 1); + if (n <= 0) + break; + + num_tx++; + num_tx_bytes += mbuf->pkt_len; + rte_pktmbuf_free(mbuf); + } + + txq->stats.opackets += num_tx; + txq->stats.errs += nb_pkts - num_tx; + txq->stats.obytes += num_tx_bytes; + + return num_tx; +} + +static int +tap_ioctl(struct pmd_internals *pmd, unsigned long request, + struct ifreq *ifr, int set, enum ioctl_mode mode) +{ + short req_flags = ifr->ifr_flags; + int remote = pmd->remote_if_index && + (mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE); + + if (!pmd->remote_if_index && mode == REMOTE_ONLY) + return 0; + /* + * If there is a remote netdevice, apply ioctl on it, then apply it on + * the tap netdevice. 
+ */ +apply: + if (remote) + snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface); + else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE) + snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name); + switch (request) { + case SIOCSIFFLAGS: + /* fetch current flags to leave other flags untouched */ + if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0) + goto error; + if (set) + ifr->ifr_flags |= req_flags; + else + ifr->ifr_flags &= ~req_flags; + break; + case SIOCGIFFLAGS: + case SIOCGIFHWADDR: + case SIOCSIFHWADDR: + case SIOCSIFMTU: + break; + default: + RTE_LOG(WARNING, PMD, "%s: ioctl() called with wrong arg\n", + pmd->name); + return -EINVAL; + } + if (ioctl(pmd->ioctl_sock, request, ifr) < 0) + goto error; + if (remote-- && mode == LOCAL_AND_REMOTE) + goto apply; + return 0; + +error: + RTE_LOG(ERR, PMD, "%s: ioctl(%lu) failed with error: %s\n", + ifr->ifr_name, request, strerror(errno)); + return -errno; +} + +static int +tap_link_set_down(struct rte_eth_dev *dev) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_flags = IFF_UP }; + + dev->data->dev_link.link_status = ETH_LINK_DOWN; + return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE); +} + +static int +tap_link_set_up(struct rte_eth_dev *dev) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_flags = IFF_UP }; + + dev->data->dev_link.link_status = ETH_LINK_UP; + return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE); +} + +static int +tap_dev_start(struct rte_eth_dev *dev) +{ + int err; + + err = tap_intr_handle_set(dev, 1); + if (err) + return err; + return tap_link_set_up(dev); +} + +/* This function gets called when the current port gets stopped. + */ +static void +tap_dev_stop(struct rte_eth_dev *dev) +{ + tap_intr_handle_set(dev, 0); + tap_link_set_down(dev); +} + +static int +tap_dev_configure(struct rte_eth_dev *dev __rte_unused) +{ + return 0; +} + +static uint32_t +tap_dev_speed_capa(void) +{ + uint32_t speed = pmd_link.link_speed; + uint32_t capa = 0; + + if (speed >= ETH_SPEED_NUM_10M) + capa |= ETH_LINK_SPEED_10M; + if (speed >= ETH_SPEED_NUM_100M) + capa |= ETH_LINK_SPEED_100M; + if (speed >= ETH_SPEED_NUM_1G) + capa |= ETH_LINK_SPEED_1G; + if (speed >= ETH_SPEED_NUM_5G) + capa |= ETH_LINK_SPEED_2_5G; + if (speed >= ETH_SPEED_NUM_5G) + capa |= ETH_LINK_SPEED_5G; + if (speed >= ETH_SPEED_NUM_10G) + capa |= ETH_LINK_SPEED_10G; + if (speed >= ETH_SPEED_NUM_20G) + capa |= ETH_LINK_SPEED_20G; + if (speed >= ETH_SPEED_NUM_25G) + capa |= ETH_LINK_SPEED_25G; + if (speed >= ETH_SPEED_NUM_40G) + capa |= ETH_LINK_SPEED_40G; + if (speed >= ETH_SPEED_NUM_50G) + capa |= ETH_LINK_SPEED_50G; + if (speed >= ETH_SPEED_NUM_56G) + capa |= ETH_LINK_SPEED_56G; + if (speed >= ETH_SPEED_NUM_100G) + capa |= ETH_LINK_SPEED_100G; + + return capa; +} + +static void +tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) +{ + struct pmd_internals *internals = dev->data->dev_private; + + dev_info->if_index = internals->if_index; + dev_info->max_mac_addrs = 1; + dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN; + dev_info->max_rx_queues = internals->nb_queues; + dev_info->max_tx_queues = internals->nb_queues; + dev_info->min_rx_bufsize = 0; + dev_info->pci_dev = NULL; + dev_info->speed_capa = tap_dev_speed_capa(); +} + +static void +tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats) +{ + unsigned int i, imax; + unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0; + unsigned long rx_bytes_total = 0, 
tx_bytes_total = 0; + unsigned long rx_nombuf = 0, ierrors = 0; + const struct pmd_internals *pmd = dev->data->dev_private; + + imax = (pmd->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ? + pmd->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS; + + for (i = 0; i < imax; i++) { + tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets; + tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes; + rx_total += tap_stats->q_ipackets[i]; + rx_bytes_total += tap_stats->q_ibytes[i]; + rx_nombuf += pmd->rxq[i].stats.rx_nombuf; + ierrors += pmd->rxq[i].stats.ierrors; + + tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets; + tap_stats->q_errors[i] = pmd->txq[i].stats.errs; + tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes; + tx_total += tap_stats->q_opackets[i]; + tx_err_total += tap_stats->q_errors[i]; + tx_bytes_total += tap_stats->q_obytes[i]; + } + + tap_stats->ipackets = rx_total; + tap_stats->ibytes = rx_bytes_total; + tap_stats->ierrors = ierrors; + tap_stats->rx_nombuf = rx_nombuf; + tap_stats->opackets = tx_total; + tap_stats->oerrors = tx_err_total; + tap_stats->obytes = tx_bytes_total; +} + +static void +tap_stats_reset(struct rte_eth_dev *dev) +{ + int i; + struct pmd_internals *pmd = dev->data->dev_private; + + for (i = 0; i < pmd->nb_queues; i++) { + pmd->rxq[i].stats.ipackets = 0; + pmd->rxq[i].stats.ibytes = 0; + pmd->rxq[i].stats.ierrors = 0; + pmd->rxq[i].stats.rx_nombuf = 0; + + pmd->txq[i].stats.opackets = 0; + pmd->txq[i].stats.errs = 0; + pmd->txq[i].stats.obytes = 0; + } +} + +static void +tap_dev_close(struct rte_eth_dev *dev __rte_unused) +{ + int i; + struct pmd_internals *internals = dev->data->dev_private; + + tap_link_set_down(dev); + tap_flow_flush(dev, NULL); + tap_flow_implicit_flush(internals, NULL); + + for (i = 0; i < internals->nb_queues; i++) { + if (internals->rxq[i].fd != -1) + close(internals->rxq[i].fd); + internals->rxq[i].fd = -1; + internals->txq[i].fd = -1; + } +} + +static void +tap_rx_queue_release(void *queue) +{ + struct rx_queue *rxq = queue; + + if (rxq && (rxq->fd > 0)) { + close(rxq->fd); + rxq->fd = -1; + rte_pktmbuf_free(rxq->pool); + rte_free(rxq->iovecs); + rxq->pool = NULL; + rxq->iovecs = NULL; + } +} + +static void +tap_tx_queue_release(void *queue) +{ + struct tx_queue *txq = queue; + + if (txq && (txq->fd > 0)) { + close(txq->fd); + txq->fd = -1; + } +} + +static int +tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused) +{ + struct rte_eth_link *dev_link = &dev->data->dev_link; + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_flags = 0 }; + + if (pmd->remote_if_index) { + tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY); + if (!(ifr.ifr_flags & IFF_UP) || + !(ifr.ifr_flags & IFF_RUNNING)) { + dev_link->link_status = ETH_LINK_DOWN; + return 0; + } + } + tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY); + dev_link->link_status = + ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ? 
+ ETH_LINK_UP : + ETH_LINK_DOWN); + return 0; +} + +static void +tap_promisc_enable(struct rte_eth_dev *dev) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_flags = IFF_PROMISC }; + + dev->data->promiscuous = 1; + tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE); + if (pmd->remote_if_index) + tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC); +} + +static void +tap_promisc_disable(struct rte_eth_dev *dev) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_flags = IFF_PROMISC }; + + dev->data->promiscuous = 0; + tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE); + if (pmd->remote_if_index) + tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC); +} + +static void +tap_allmulti_enable(struct rte_eth_dev *dev) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI }; + + dev->data->all_multicast = 1; + tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE); + if (pmd->remote_if_index) + tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI); +} + +static void +tap_allmulti_disable(struct rte_eth_dev *dev) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI }; + + dev->data->all_multicast = 0; + tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE); + if (pmd->remote_if_index) + tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI); +} + + +static void +tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr; + + if (is_zero_ether_addr(mac_addr)) { + RTE_LOG(ERR, PMD, "%s: can't set an empty MAC address\n", + dev->data->name); + return; + } + /* Check the actual current MAC address on the tap netdevice */ + if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY) != 0) { + RTE_LOG(ERR, PMD, + "%s: couldn't check current tap MAC address\n", + dev->data->name); + return; + } + if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data, + mac_addr)) + return; + + ifr.ifr_hwaddr.sa_family = AF_LOCAL; + rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN); + if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, LOCAL_AND_REMOTE) < 0) + return; + rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN); + if (pmd->remote_if_index) { + /* Replace MAC redirection rule after a MAC change */ + if (tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC) < 0) { + RTE_LOG(ERR, PMD, + "%s: Couldn't delete MAC redirection rule\n", + dev->data->name); + return; + } + if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0) + RTE_LOG(ERR, PMD, + "%s: Couldn't add MAC redirection rule\n", + dev->data->name); + } +} + +static int +tap_setup_queue(struct rte_eth_dev *dev, + struct pmd_internals *internals, + uint16_t qid) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct rx_queue *rx = &internals->rxq[qid]; + struct tx_queue *tx = &internals->txq[qid]; + int fd; + + fd = rx->fd; + if (fd < 0) { + fd = tx->fd; + if (fd < 0) { + RTE_LOG(INFO, PMD, "Add queue to TAP %s for qid %d\n", + pmd->name, qid); + fd = tun_alloc(pmd, qid); + if (fd < 0) { + RTE_LOG(ERR, PMD, "tun_alloc(%s, %d) failed\n", + pmd->name, qid); + return -1; + } + if (qid == 0) { + struct ifreq ifr; + + ifr.ifr_mtu = dev->data->mtu; + if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, + LOCAL_AND_REMOTE) < 0) { + close(fd); + return -1; + } + } + } + } + + rx->fd = fd; + tx->fd = fd; + tx->mtu = &dev->data->mtu; + rx->rxmode = &dev->data->dev_conf.rxmode; + + return fd; +} + 
+static int +rx_setup_queue(struct rte_eth_dev *dev, + struct pmd_internals *internals, + uint16_t qid) +{ + dev->data->rx_queues[qid] = &internals->rxq[qid]; + + return tap_setup_queue(dev, internals, qid); +} + +static int +tx_setup_queue(struct rte_eth_dev *dev, + struct pmd_internals *internals, + uint16_t qid) +{ + dev->data->tx_queues[qid] = &internals->txq[qid]; + + return tap_setup_queue(dev, internals, qid); +} + +static int +tap_rx_queue_setup(struct rte_eth_dev *dev, + uint16_t rx_queue_id, + uint16_t nb_rx_desc, + unsigned int socket_id, + const struct rte_eth_rxconf *rx_conf __rte_unused, + struct rte_mempool *mp) +{ + struct pmd_internals *internals = dev->data->dev_private; + struct rx_queue *rxq = &internals->rxq[rx_queue_id]; + struct rte_mbuf **tmp = &rxq->pool; + long iov_max = sysconf(_SC_IOV_MAX); + uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1); + struct iovec (*iovecs)[nb_desc + 1]; + int data_off = RTE_PKTMBUF_HEADROOM; + int ret = 0; + int fd; + int i; + + if ((rx_queue_id >= internals->nb_queues) || !mp) { + RTE_LOG(WARNING, PMD, + "nb_queues %d too small or mempool NULL\n", + internals->nb_queues); + return -1; + } + + rxq->mp = mp; + rxq->trigger_seen = 1; /* force initial burst */ + rxq->in_port = dev->data->port_id; + rxq->nb_rx_desc = nb_desc; + iovecs = rte_zmalloc_socket(dev->data->name, sizeof(*iovecs), 0, + socket_id); + if (!iovecs) { + RTE_LOG(WARNING, PMD, + "%s: Couldn't allocate %d RX descriptors\n", + dev->data->name, nb_desc); + return -ENOMEM; + } + rxq->iovecs = iovecs; + + fd = rx_setup_queue(dev, internals, rx_queue_id); + if (fd == -1) { + ret = fd; + goto error; + } + + (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi); + (*rxq->iovecs)[0].iov_base = &rxq->pi; + + for (i = 1; i <= nb_desc; i++) { + *tmp = rte_pktmbuf_alloc(rxq->mp); + if (!*tmp) { + RTE_LOG(WARNING, PMD, + "%s: couldn't allocate memory for queue %d\n", + dev->data->name, rx_queue_id); + ret = -ENOMEM; + goto error; + } + (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off; + (*rxq->iovecs)[i].iov_base = + (char *)(*tmp)->buf_addr + data_off; + data_off = 0; + tmp = &(*tmp)->next; + } + + RTE_LOG(DEBUG, PMD, " RX TAP device name %s, qid %d on fd %d\n", + internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd); + + return 0; + +error: + rte_pktmbuf_free(rxq->pool); + rxq->pool = NULL; + rte_free(rxq->iovecs); + rxq->iovecs = NULL; + return ret; +} + +static int +tap_tx_queue_setup(struct rte_eth_dev *dev, + uint16_t tx_queue_id, + uint16_t nb_tx_desc __rte_unused, + unsigned int socket_id __rte_unused, + const struct rte_eth_txconf *tx_conf __rte_unused) +{ + struct pmd_internals *internals = dev->data->dev_private; + int ret; + + if (tx_queue_id >= internals->nb_queues) + return -1; + + ret = tx_setup_queue(dev, internals, tx_queue_id); + if (ret == -1) + return -1; + + RTE_LOG(DEBUG, PMD, " TX TAP device name %s, qid %d on fd %d\n", + internals->name, tx_queue_id, internals->txq[tx_queue_id].fd); + + return 0; +} + +static int +tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct ifreq ifr = { .ifr_mtu = mtu }; + int err = 0; + + err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE); + if (!err) + dev->data->mtu = mtu; + + return err; +} + +static int +tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused, + struct ether_addr *mc_addr_set __rte_unused, + uint32_t nb_mc_addr __rte_unused) +{ + /* + * Nothing to do actually: the tap has no filtering whatsoever, every + * packet is received. 
+ */ + return 0; +} + +static int +tap_nl_msg_handler(struct nlmsghdr *nh, void *arg) +{ + struct rte_eth_dev *dev = arg; + struct pmd_internals *pmd = dev->data->dev_private; + struct ifinfomsg *info = NLMSG_DATA(nh); + + if (nh->nlmsg_type != RTM_NEWLINK || + (info->ifi_index != pmd->if_index && + info->ifi_index != pmd->remote_if_index)) + return 0; + return tap_link_update(dev, 0); +} + +static void +tap_dev_intr_handler(void *cb_arg) +{ + struct rte_eth_dev *dev = cb_arg; + struct pmd_internals *pmd = dev->data->dev_private; + + nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev); +} + +static int +tap_intr_handle_set(struct rte_eth_dev *dev, int set) +{ + struct pmd_internals *pmd = dev->data->dev_private; + + /* In any case, disable interrupt if the conf is no longer there. */ + if (!dev->data->dev_conf.intr_conf.lsc) { + if (pmd->intr_handle.fd != -1) + nl_final(pmd->intr_handle.fd); + rte_intr_callback_unregister( + &pmd->intr_handle, tap_dev_intr_handler, dev); + return 0; + } + if (set) { + pmd->intr_handle.fd = nl_init(RTMGRP_LINK); + if (unlikely(pmd->intr_handle.fd == -1)) + return -EBADF; + return rte_intr_callback_register( + &pmd->intr_handle, tap_dev_intr_handler, dev); + } + nl_final(pmd->intr_handle.fd); + return rte_intr_callback_unregister(&pmd->intr_handle, + tap_dev_intr_handler, dev); +} + +static const uint32_t* +tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused) +{ + static const uint32_t ptypes[] = { + RTE_PTYPE_INNER_L2_ETHER, + RTE_PTYPE_INNER_L2_ETHER_VLAN, + RTE_PTYPE_INNER_L2_ETHER_QINQ, + RTE_PTYPE_INNER_L3_IPV4, + RTE_PTYPE_INNER_L3_IPV4_EXT, + RTE_PTYPE_INNER_L3_IPV6, + RTE_PTYPE_INNER_L3_IPV6_EXT, + RTE_PTYPE_INNER_L4_FRAG, + RTE_PTYPE_INNER_L4_UDP, + RTE_PTYPE_INNER_L4_TCP, + RTE_PTYPE_INNER_L4_SCTP, + RTE_PTYPE_L2_ETHER, + RTE_PTYPE_L2_ETHER_VLAN, + RTE_PTYPE_L2_ETHER_QINQ, + RTE_PTYPE_L3_IPV4, + RTE_PTYPE_L3_IPV4_EXT, + RTE_PTYPE_L3_IPV6_EXT, + RTE_PTYPE_L3_IPV6, + RTE_PTYPE_L4_FRAG, + RTE_PTYPE_L4_UDP, + RTE_PTYPE_L4_TCP, + RTE_PTYPE_L4_SCTP, + }; + + return ptypes; +} + +static int +tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_fc_conf *fc_conf) +{ + fc_conf->mode = RTE_FC_NONE; + return 0; +} + +static int +tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_fc_conf *fc_conf) +{ + if (fc_conf->mode != RTE_FC_NONE) + return -ENOTSUP; + return 0; +} + +static const struct eth_dev_ops ops = { + .dev_start = tap_dev_start, + .dev_stop = tap_dev_stop, + .dev_close = tap_dev_close, + .dev_configure = tap_dev_configure, + .dev_infos_get = tap_dev_info, + .rx_queue_setup = tap_rx_queue_setup, + .tx_queue_setup = tap_tx_queue_setup, + .rx_queue_release = tap_rx_queue_release, + .tx_queue_release = tap_tx_queue_release, + .flow_ctrl_get = tap_flow_ctrl_get, + .flow_ctrl_set = tap_flow_ctrl_set, + .link_update = tap_link_update, + .dev_set_link_up = tap_link_set_up, + .dev_set_link_down = tap_link_set_down, + .promiscuous_enable = tap_promisc_enable, + .promiscuous_disable = tap_promisc_disable, + .allmulticast_enable = tap_allmulti_enable, + .allmulticast_disable = tap_allmulti_disable, + .mac_addr_set = tap_mac_set, + .mtu_set = tap_mtu_set, + .set_mc_addr_list = tap_set_mc_addr_list, + .stats_get = tap_stats_get, + .stats_reset = tap_stats_reset, + .dev_supported_ptypes_get = tap_dev_supported_ptypes_get, + .filter_ctrl = tap_dev_filter_ctrl, +}; + +static int +tap_kernel_support(struct pmd_internals *pmd) +{ + struct utsname utsname; + int ver[3]; + + if (uname(&utsname) == -1 || + 
sscanf(utsname.release, "%d.%d.%d", + &ver[0], &ver[1], &ver[2]) != 3) + return 0; + if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION) + pmd->flower_support = 1; + if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= + FLOWER_VLAN_KERNEL_VERSION) + pmd->flower_vlan_support = 1; + return 1; +} + +static int +eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name, + char *remote_iface) +{ + int numa_node = rte_socket_id(); + struct rte_eth_dev *dev; + struct pmd_internals *pmd; + struct rte_eth_dev_data *data; + int i; + + RTE_LOG(DEBUG, PMD, " TAP device on numa %u\n", rte_socket_id()); + + data = rte_zmalloc_socket(tap_name, sizeof(*data), 0, numa_node); + if (!data) { + RTE_LOG(ERR, PMD, "TAP Failed to allocate data\n"); + goto error_exit; + } + + dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd)); + if (!dev) { + RTE_LOG(ERR, PMD, "TAP Unable to allocate device struct\n"); + goto error_exit; + } + + pmd = dev->data->dev_private; + snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name); + pmd->nb_queues = RTE_PMD_TAP_MAX_QUEUES; + + pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0); + if (pmd->ioctl_sock == -1) { + RTE_LOG(ERR, PMD, + "TAP Unable to get a socket for management: %s\n", + strerror(errno)); + goto error_exit; + } + + /* Setup some default values */ + rte_memcpy(data, dev->data, sizeof(*data)); + data->dev_private = pmd; + data->dev_flags = RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC; + data->numa_node = numa_node; + data->drv_name = pmd_tap_drv.driver.name; + + data->dev_link = pmd_link; + data->mac_addrs = &pmd->eth_addr; + data->nb_rx_queues = pmd->nb_queues; + data->nb_tx_queues = pmd->nb_queues; + + dev->data = data; + dev->dev_ops = &ops; + dev->rx_pkt_burst = pmd_rx_burst; + dev->tx_pkt_burst = pmd_tx_burst; + + pmd->intr_handle.type = RTE_INTR_HANDLE_EXT; + pmd->intr_handle.fd = -1; + + /* Presetup the fds to -1 as being not valid */ + for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) { + pmd->rxq[i].fd = -1; + pmd->txq[i].fd = -1; + } + + tap_kernel_support(pmd); + if (!pmd->flower_support) + return 0; + LIST_INIT(&pmd->flows); + /* + * If no netlink socket can be created, then it will fail when + * creating/destroying flow rules. + */ + pmd->nlsk_fd = nl_init(0); + if (strlen(remote_iface)) { + struct ifreq ifr; + + pmd->remote_if_index = if_nametoindex(remote_iface); + snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN, + "%s", remote_iface); + if (!pmd->remote_if_index) { + RTE_LOG(ERR, PMD, "Could not find %s ifindex: " + "remote interface will remain unconfigured\n", + remote_iface); + return 0; + } + if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) { + RTE_LOG(ERR, PMD, "Could not get remote MAC address\n"); + goto error_exit; + } + rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data, + ETHER_ADDR_LEN); + } else { + eth_random_addr((uint8_t *)&pmd->eth_addr); + } + + return 0; + +error_exit: + RTE_LOG(DEBUG, PMD, "TAP Unable to initialize %s\n", + rte_vdev_device_name(vdev)); + + rte_free(data); + return -EINVAL; +} + +static int +set_interface_name(const char *key __rte_unused, + const char *value, + void *extra_args) +{ + char *name = (char *)extra_args; + + if (value) + snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s", value); + else + snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d", + DEFAULT_TAP_NAME, (tap_unit - 1)); + + return 0; +} + +static int +set_interface_speed(const char *key __rte_unused, + const char *value, + void *extra_args) +{ + *(int *)extra_args = (value) ? 
atoi(value) : ETH_SPEED_NUM_10G; + + return 0; +} + +static int +set_remote_iface(const char *key __rte_unused, + const char *value, + void *extra_args) +{ + char *name = (char *)extra_args; + + if (value) + snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s", value); + + return 0; +} + +/* Open a TAP interface device. + */ +static int +rte_pmd_tap_probe(struct rte_vdev_device *dev) +{ + const char *name, *params; + int ret; + struct rte_kvargs *kvlist = NULL; + int speed; + char tap_name[RTE_ETH_NAME_MAX_LEN]; + char remote_iface[RTE_ETH_NAME_MAX_LEN]; + + name = rte_vdev_device_name(dev); + params = rte_vdev_device_args(dev); + + speed = ETH_SPEED_NUM_10G; + snprintf(tap_name, sizeof(tap_name), "%s%d", + DEFAULT_TAP_NAME, tap_unit++); + memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN); + + if (params && (params[0] != '\0')) { + RTE_LOG(DEBUG, PMD, "paramaters (%s)\n", params); + + kvlist = rte_kvargs_parse(params, valid_arguments); + if (kvlist) { + if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) { + ret = rte_kvargs_process(kvlist, + ETH_TAP_SPEED_ARG, + &set_interface_speed, + &speed); + if (ret == -1) + goto leave; + } + + if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) { + ret = rte_kvargs_process(kvlist, + ETH_TAP_IFACE_ARG, + &set_interface_name, + tap_name); + if (ret == -1) + goto leave; + } + + if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) { + ret = rte_kvargs_process(kvlist, + ETH_TAP_REMOTE_ARG, + &set_remote_iface, + remote_iface); + if (ret == -1) + goto leave; + } + } + } + pmd_link.link_speed = speed; + + RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n", + name, tap_name); + + ret = eth_dev_tap_create(dev, tap_name, remote_iface); + +leave: + if (ret == -1) { + RTE_LOG(ERR, PMD, "Failed to create pmd for %s as %s\n", + name, tap_name); + tap_unit--; /* Restore the unit number */ + } + rte_kvargs_free(kvlist); + + return ret; +} + +/* detach a TAP device. + */ +static int +rte_pmd_tap_remove(struct rte_vdev_device *dev) +{ + struct rte_eth_dev *eth_dev = NULL; + struct pmd_internals *internals; + int i; + + RTE_LOG(DEBUG, PMD, "Closing TUN/TAP Ethernet device on numa %u\n", + rte_socket_id()); + + /* find the ethdev entry */ + eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev)); + if (!eth_dev) + return 0; + + internals = eth_dev->data->dev_private; + if (internals->flower_support && internals->nlsk_fd) { + tap_flow_flush(eth_dev, NULL); + tap_flow_implicit_flush(internals, NULL); + nl_final(internals->nlsk_fd); + } + for (i = 0; i < internals->nb_queues; i++) + if (internals->rxq[i].fd != -1) + close(internals->rxq[i].fd); + + close(internals->ioctl_sock); + rte_free(eth_dev->data->dev_private); + rte_free(eth_dev->data); + + rte_eth_dev_release_port(eth_dev); + + return 0; +} + +static struct rte_vdev_driver pmd_tap_drv = { + .probe = rte_pmd_tap_probe, + .remove = rte_pmd_tap_remove, +}; +RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv); +RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap); +RTE_PMD_REGISTER_PARAM_STRING(net_tap, + ETH_TAP_IFACE_ARG "=<string> " + ETH_TAP_SPEED_ARG "=<int> " + ETH_TAP_REMOTE_ARG "=<string>"); diff --git a/src/seastar/dpdk/drivers/net/tap/rte_eth_tap.h b/src/seastar/dpdk/drivers/net/tap/rte_eth_tap.h new file mode 100644 index 00000000..ad497b3d --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/rte_eth_tap.h @@ -0,0 +1,100 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ETH_TAP_H_ +#define _RTE_ETH_TAP_H_ + +#include <sys/queue.h> +#include <sys/uio.h> +#include <inttypes.h> + +#include <linux/if_tun.h> + +#include <rte_ethdev.h> +#include <rte_ether.h> + +#ifdef IFF_MULTI_QUEUE +#define RTE_PMD_TAP_MAX_QUEUES 16 +#else +#define RTE_PMD_TAP_MAX_QUEUES 1 +#endif + +struct pkt_stats { + uint64_t opackets; /* Number of output packets */ + uint64_t ipackets; /* Number of input packets */ + uint64_t obytes; /* Number of bytes on output */ + uint64_t ibytes; /* Number of bytes on input */ + uint64_t errs; /* Number of TX error packets */ + uint64_t ierrors; /* Number of RX error packets */ + uint64_t rx_nombuf; /* Nb of RX mbuf alloc failures */ +}; + +struct rx_queue { + struct rte_mempool *mp; /* Mempool for RX packets */ + uint32_t trigger_seen; /* Last seen Rx trigger value */ + uint16_t in_port; /* Port ID */ + int fd; + struct pkt_stats stats; /* Stats for this RX queue */ + uint16_t nb_rx_desc; /* max number of mbufs available */ + struct rte_eth_rxmode *rxmode; /* RX features */ + struct rte_mbuf *pool; /* mbufs pool for this queue */ + struct iovec (*iovecs)[]; /* descriptors for this queue */ + struct tun_pi pi; /* packet info for iovecs */ +}; + +struct tx_queue { + int fd; + uint16_t *mtu; /* Pointer to MTU from dev_data */ + struct pkt_stats stats; /* Stats for this TX queue */ +}; + +struct pmd_internals { + char remote_iface[RTE_ETH_NAME_MAX_LEN]; /* Remote netdevice name */ + char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device name */ + uint16_t nb_queues; /* Number of queues supported */ + struct ether_addr eth_addr; /* Mac address of the device port */ + int remote_if_index; /* remote netdevice IF_INDEX */ + int if_index; /* IF_INDEX for the port */ + int ioctl_sock; /* socket for ioctl calls */ + int nlsk_fd; /* Netlink socket fd */ + int flower_support; /* 1 if kernel supports, else 0 */ + int flower_vlan_support; /* 1 if kernel supports, else 0 */ + LIST_HEAD(tap_flows, rte_flow) flows; /* rte_flow rules */ + /* implicit rte_flow 
rules set when a remote device is active */ + LIST_HEAD(tap_implicit_flows, rte_flow) implicit_flows; + struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */ + struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */ + struct rte_intr_handle intr_handle; /* LSC interrupt handle. */ +}; + +#endif /* _RTE_ETH_TAP_H_ */ diff --git a/src/seastar/dpdk/drivers/net/tap/rte_pmd_tap_version.map b/src/seastar/dpdk/drivers/net/tap/rte_pmd_tap_version.map new file mode 100644 index 00000000..31eca32e --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/rte_pmd_tap_version.map @@ -0,0 +1,4 @@ +DPDK_17.02 { + + local: *; +}; diff --git a/src/seastar/dpdk/drivers/net/tap/tap_flow.c b/src/seastar/dpdk/drivers/net/tap/tap_flow.c new file mode 100644 index 00000000..cf1c8a26 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/tap_flow.c @@ -0,0 +1,1507 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <errno.h> +#include <string.h> +#include <sys/queue.h> + +#include <rte_byteorder.h> +#include <rte_jhash.h> +#include <rte_malloc.h> +#include <rte_eth_tap.h> +#include <tap_flow.h> +#include <tap_autoconf.h> +#include <tap_tcmsgs.h> + +#ifndef HAVE_TC_FLOWER +/* + * For kernels < 4.2, this enum is not defined. Runtime checks will be made to + * avoid sending TC messages the kernel cannot understand. 
+ */ +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ +}; +#endif +#ifndef HAVE_TC_VLAN_ID +enum { + /* TCA_FLOWER_FLAGS, */ + TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ +}; +#endif + +struct rte_flow { + LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */ + struct rte_flow *remote_flow; /* associated remote flow */ + struct nlmsg msg; +}; + +struct convert_data { + uint16_t eth_type; + uint16_t ip_proto; + uint8_t vlan; + struct rte_flow *flow; +}; + +struct remote_rule { + struct rte_flow_attr attr; + struct rte_flow_item items[2]; + int mirred; +}; + +static int tap_flow_create_eth(const struct rte_flow_item *item, void *data); +static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data); +static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data); +static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data); +static int tap_flow_create_udp(const struct rte_flow_item *item, void *data); +static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data); +static int +tap_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); + +static struct rte_flow * +tap_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); + +static int +tap_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error); + +static const struct rte_flow_ops tap_flow_ops = { + .validate = tap_flow_validate, + .create = tap_flow_create, + .destroy = tap_flow_destroy, + .flush = tap_flow_flush, +}; + +/* Static initializer for items. */ +#define ITEMS(...) \ + (const enum rte_flow_item_type []){ \ + __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ + } + +/* Structure to generate a simple graph of layers supported by the NIC. */ +struct tap_flow_items { + /* Bit-mask corresponding to what is supported for this item. */ + const void *mask; + const unsigned int mask_sz; /* Bit-mask size in bytes. */ + /* + * Bit-mask corresponding to the default mask, if none is provided + * along with the item. + */ + const void *default_mask; + /** + * Conversion function from rte_flow to netlink attributes. + * + * @param item + * rte_flow item to convert. + * @param data + * Internal structure to store the conversion. + * + * @return + * 0 on success, negative value otherwise. + */ + int (*convert)(const struct rte_flow_item *item, void *data); + /** List of possible following items. 
*/ + const enum rte_flow_item_type *const items; +}; + +/* Graph of supported items and associated actions. */ +static const struct tap_flow_items tap_flow_items[] = { + [RTE_FLOW_ITEM_TYPE_END] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH), + }, + [RTE_FLOW_ITEM_TYPE_ETH] = { + .items = ITEMS( + RTE_FLOW_ITEM_TYPE_VLAN, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_IPV6), + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .type = -1, + }, + .mask_sz = sizeof(struct rte_flow_item_eth), + .default_mask = &rte_flow_item_eth_mask, + .convert = tap_flow_create_eth, + }, + [RTE_FLOW_ITEM_TYPE_VLAN] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_IPV6), + .mask = &(const struct rte_flow_item_vlan){ + .tpid = -1, + /* DEI matching is not supported */ +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + .tci = 0xffef, +#else + .tci = 0xefff, +#endif + }, + .mask_sz = sizeof(struct rte_flow_item_vlan), + .default_mask = &rte_flow_item_vlan_mask, + .convert = tap_flow_create_vlan, + }, + [RTE_FLOW_ITEM_TYPE_IPV4] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_TCP), + .mask = &(const struct rte_flow_item_ipv4){ + .hdr = { + .src_addr = -1, + .dst_addr = -1, + .next_proto_id = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_ipv4), + .default_mask = &rte_flow_item_ipv4_mask, + .convert = tap_flow_create_ipv4, + }, + [RTE_FLOW_ITEM_TYPE_IPV6] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_TCP), + .mask = &(const struct rte_flow_item_ipv6){ + .hdr = { + .src_addr = { + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + }, + .dst_addr = { + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + }, + .proto = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_ipv6), + .default_mask = &rte_flow_item_ipv6_mask, + .convert = tap_flow_create_ipv6, + }, + [RTE_FLOW_ITEM_TYPE_UDP] = { + .mask = &(const struct rte_flow_item_udp){ + .hdr = { + .src_port = -1, + .dst_port = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_udp), + .default_mask = &rte_flow_item_udp_mask, + .convert = tap_flow_create_udp, + }, + [RTE_FLOW_ITEM_TYPE_TCP] = { + .mask = &(const struct rte_flow_item_tcp){ + .hdr = { + .src_port = -1, + .dst_port = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_tcp), + .default_mask = &rte_flow_item_tcp_mask, + .convert = tap_flow_create_tcp, + }, +}; + +static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = { + [TAP_REMOTE_LOCAL_MAC] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_REDIR, + }, + [TAP_REMOTE_BROADCAST] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + .spec = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_BROADCASTV6] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6, + .ingress = 1, + }, 
+ .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00", + }, + .spec = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_PROMISC] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_VOID, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_ALLMULTI] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + }, + .spec = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_TX] = { + .attr = { + .group = 0, + .priority = TAP_REMOTE_TX, + .egress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_VOID, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, +}; + +/** + * Make as much checks as possible on an Ethernet item, and if a flow is + * provided, fill it appropriately with Ethernet info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_eth(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask; + /* TC does not support eth_type masking. Only accept if exact match. */ + if (mask->type && mask->type != 0xffff) + return -1; + if (!spec) + return 0; + /* store eth_type for consistency if ipv4/6 pattern item comes next */ + if (spec->type & mask->type) + info->eth_type = spec->type; + if (!flow) + return 0; + msg = &flow->msg; + if (spec->type & mask->type) + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, + (spec->type & mask->type)); + if (!is_zero_ether_addr(&spec->dst)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN, + &spec->dst.addr_bytes); + nlattr_add(&msg->nh, + TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN, + &mask->dst.addr_bytes); + } + if (!is_zero_ether_addr(&mask->src)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN, + &spec->src.addr_bytes); + nlattr_add(&msg->nh, + TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN, + &mask->src.addr_bytes); + } + return 0; +} + +/** + * Make as much checks as possible on a VLAN item, and if a flow is provided, + * fill it appropriately with VLAN info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. 
+ */ +static int +tap_flow_create_vlan(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask; + /* TC does not support tpid masking. Only accept if exact match. */ + if (mask->tpid && mask->tpid != 0xffff) + return -1; + /* Double-tagging not supported. */ + if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q)) + return -1; + info->vlan = 1; + if (!flow) + return 0; + msg = &flow->msg; + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q)); +#define VLAN_PRIO(tci) ((tci) >> 13) +#define VLAN_ID(tci) ((tci) & 0xfff) + if (!spec) + return 0; + if (spec->tci) { + uint16_t tci = ntohs(spec->tci) & mask->tci; + uint16_t prio = VLAN_PRIO(tci); + uint8_t vid = VLAN_ID(tci); + + if (prio) + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio); + if (vid) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid); + } + return 0; +} + +/** + * Make as much checks as possible on an IPv4 item, and if a flow is provided, + * fill it appropriately with IPv4 info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_ipv4(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask; + /* check that previous eth type is compatible with ipv4 */ + if (info->eth_type && info->eth_type != htons(ETH_P_IP)) + return -1; + /* store ip_proto for consistency if udp/tcp pattern item comes next */ + if (spec) + info->ip_proto = spec->hdr.next_proto_id; + if (!flow) + return 0; + msg = &flow->msg; + if (!info->eth_type) + info->eth_type = htons(ETH_P_IP); + if (!info->vlan) + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP)); + if (!spec) + return 0; + if (spec->hdr.dst_addr) { + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST, + spec->hdr.dst_addr); + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK, + mask->hdr.dst_addr); + } + if (spec->hdr.src_addr) { + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC, + spec->hdr.src_addr); + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK, + mask->hdr.src_addr); + } + if (spec->hdr.next_proto_id) + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, + spec->hdr.next_proto_id); + return 0; +} + +/** + * Make as much checks as possible on an IPv6 item, and if a flow is provided, + * fill it appropriately with IPv6 info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. 
+ */ +static int +tap_flow_create_ipv6(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 *mask = item->mask; + struct rte_flow *flow = info->flow; + uint8_t empty_addr[16] = { 0 }; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask; + /* check that previous eth type is compatible with ipv6 */ + if (info->eth_type && info->eth_type != htons(ETH_P_IPV6)) + return -1; + /* store ip_proto for consistency if udp/tcp pattern item comes next */ + if (spec) + info->ip_proto = spec->hdr.proto; + if (!flow) + return 0; + msg = &flow->msg; + if (!info->eth_type) + info->eth_type = htons(ETH_P_IPV6); + if (!info->vlan) + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6)); + if (!spec) + return 0; + if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST, + sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr); + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr); + } + if (memcmp(spec->hdr.src_addr, empty_addr, 16)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC, + sizeof(spec->hdr.src_addr), &spec->hdr.src_addr); + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(mask->hdr.src_addr), &mask->hdr.src_addr); + } + if (spec->hdr.proto) + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto); + return 0; +} + +/** + * Make as much checks as possible on a UDP item, and if a flow is provided, + * fill it appropriately with UDP info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_udp(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask; + /* check that previous ip_proto is compatible with udp */ + if (info->ip_proto && info->ip_proto != IPPROTO_UDP) + return -1; + /* TC does not support UDP port masking. Only accept if exact match. */ + if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) || + (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff)) + return -1; + if (!flow) + return 0; + msg = &flow->msg; + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP); + if (!spec) + return 0; + if (spec->hdr.dst_port & mask->hdr.dst_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST, + spec->hdr.dst_port); + if (spec->hdr.src_port & mask->hdr.src_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC, + spec->hdr.src_port); + return 0; +} + +/** + * Make as much checks as possible on a TCP item, and if a flow is provided, + * fill it appropriately with TCP info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. 
+ */ +static int +tap_flow_create_tcp(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask; + /* check that previous ip_proto is compatible with tcp */ + if (info->ip_proto && info->ip_proto != IPPROTO_TCP) + return -1; + /* TC does not support TCP port masking. Only accept if exact match. */ + if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) || + (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff)) + return -1; + if (!flow) + return 0; + msg = &flow->msg; + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP); + if (!spec) + return 0; + if (spec->hdr.dst_port & mask->hdr.dst_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST, + spec->hdr.dst_port); + if (spec->hdr.src_port & mask->hdr.src_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC, + spec->hdr.src_port); + return 0; +} + +/** + * Check support for a given item. + * + * @param[in] item + * Item specification. + * @param size + * Bit-Mask size in bytes. + * @param[in] supported_mask + * Bit-mask covering supported fields to compare with spec, last and mask in + * \item. + * @param[in] default_mask + * Bit-mask default mask if none is provided in \item. + * + * @return + * 0 on success. + */ +static int +tap_flow_item_validate(const struct rte_flow_item *item, + unsigned int size, + const uint8_t *supported_mask, + const uint8_t *default_mask) +{ + int ret = 0; + + /* An empty layer is allowed, as long as all fields are NULL */ + if (!item->spec && (item->mask || item->last)) + return -1; + /* Is the item spec compatible with what the NIC supports? */ + if (item->spec && !item->mask) { + unsigned int i; + const uint8_t *spec = item->spec; + + for (i = 0; i < size; ++i) + if ((spec[i] | supported_mask[i]) != supported_mask[i]) + return -1; + /* Is the default mask compatible with what the NIC supports? */ + for (i = 0; i < size; i++) + if ((default_mask[i] | supported_mask[i]) != + supported_mask[i]) + return -1; + } + /* Is the item last compatible with what the NIC supports? */ + if (item->last && !item->mask) { + unsigned int i; + const uint8_t *spec = item->last; + + for (i = 0; i < size; ++i) + if ((spec[i] | supported_mask[i]) != supported_mask[i]) + return -1; + } + /* Is the item mask compatible with what the NIC supports? */ + if (item->mask) { + unsigned int i; + const uint8_t *spec = item->mask; + + for (i = 0; i < size; ++i) + if ((spec[i] | supported_mask[i]) != supported_mask[i]) + return -1; + } + /** + * Once masked, Are item spec and item last equal? + * TC does not support range so anything else is invalid. + */ + if (item->spec && item->last) { + uint8_t spec[size]; + uint8_t last[size]; + const uint8_t *apply = default_mask; + unsigned int i; + + if (item->mask) + apply = item->mask; + for (i = 0; i < size; ++i) { + spec[i] = ((const uint8_t *)item->spec)[i] & apply[i]; + last[i] = ((const uint8_t *)item->last)[i] & apply[i]; + } + ret = memcmp(spec, last, size); + } + return ret; +} + +/** + * Transform a DROP/PASSTHRU action item in the provided flow for TC. + * + * @param[in, out] flow + * Flow to be filled. + * @param[in] action + * Appropriate action to be set in the TCA_GACT_PARMS structure. + * + * @return + * 0 if checks are alright, -1 otherwise. 
+ */ +static int +add_action_gact(struct rte_flow *flow, int action) +{ + struct nlmsg *msg = &flow->msg; + size_t act_index = 1; + struct tc_gact p = { + .action = action + }; + + if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) + return -1; + if (nlattr_nested_start(msg, act_index++) < 0) + return -1; + nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact"); + if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) + return -1; + nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p); + nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ + nlattr_nested_finish(msg); /* nested act_index */ + nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ + return 0; +} + +/** + * Transform a MIRRED action item in the provided flow for TC. + * + * @param[in, out] flow + * Flow to be filled. + * @param[in] ifindex + * Netdevice ifindex, where to mirror/redirect packet to. + * @param[in] action_type + * Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type) +{ + struct nlmsg *msg = &flow->msg; + size_t act_index = 1; + struct tc_mirred p = { + .eaction = action_type, + .ifindex = ifindex, + }; + + if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) + return -1; + if (nlattr_nested_start(msg, act_index++) < 0) + return -1; + nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred"); + if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) + return -1; + if (action_type == TCA_EGRESS_MIRROR) + p.action = TC_ACT_PIPE; + else /* REDIRECT */ + p.action = TC_ACT_STOLEN; + nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p); + nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ + nlattr_nested_finish(msg); /* nested act_index */ + nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ + return 0; +} + +/** + * Transform a QUEUE action item in the provided flow for TC. + * + * @param[in, out] flow + * Flow to be filled. + * @param[in] queue + * Queue id to use. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +add_action_skbedit(struct rte_flow *flow, uint16_t queue) +{ + struct nlmsg *msg = &flow->msg; + size_t act_index = 1; + struct tc_skbedit p = { + .action = TC_ACT_PIPE + }; + + if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) + return -1; + if (nlattr_nested_start(msg, act_index++) < 0) + return -1; + nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit"); + if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) + return -1; + nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p); + nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue); + nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ + nlattr_nested_finish(msg); /* nested act_index */ + nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ + return 0; +} + +/** + * Validate a flow supported by TC. + * If flow param is not NULL, then also fill the netlink message inside. + * + * @param pmd + * Pointer to private structure. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. + * @param[in, out] flow + * Flow structure to update. 
+ * @param[in] mirred + * If set to TCA_EGRESS_REDIR, provided actions will be replaced with a + * redirection to the tap netdevice, and the TC rule will be configured + * on the remote netdevice in pmd. + * If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a + * mirroring to the tap netdevice, and the TC rule will be configured + * on the remote netdevice in pmd. Matching packets will thus be duplicated. + * If set to 0, the standard behavior is to be used: set correct actions for + * the TC rule, and apply it on the tap netdevice. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +priv_flow_process(struct pmd_internals *pmd, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error, + struct rte_flow *flow, + int mirred) +{ + const struct tap_flow_items *cur_item = tap_flow_items; + struct convert_data data = { + .eth_type = 0, + .ip_proto = 0, + .flow = flow, + }; + int action = 0; /* Only one action authorized for now */ + + if (attr->group > MAX_GROUP) { + rte_flow_error_set( + error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, "group value too big: cannot exceed 15"); + return -rte_errno; + } + if (attr->priority > MAX_PRIORITY) { + rte_flow_error_set( + error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "priority value too big"); + return -rte_errno; + } else if (flow) { + uint16_t group = attr->group << GROUP_SHIFT; + uint16_t prio = group | (attr->priority + PRIORITY_OFFSET); + flow->msg.t.tcm_info = TC_H_MAKE(prio << 16, + flow->msg.t.tcm_info); + } + if (flow) { + if (mirred) { + /* + * If attr->ingress, the rule applies on remote ingress + * to match incoming packets + * If attr->egress, the rule applies on tap ingress (as + * seen from the kernel) to deal with packets going out + * from the DPDK app. + */ + flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0); + } else { + /* Standard rule on tap egress (kernel standpoint). */ + flow->msg.t.tcm_parent = + TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0); + } + /* use flower filter type */ + nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower"); + if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0) + goto exit_item_not_supported; + } + for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { + const struct tap_flow_items *token = NULL; + unsigned int i; + int err = 0; + + if (items->type == RTE_FLOW_ITEM_TYPE_VOID) + continue; + for (i = 0; + cur_item->items && + cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END; + ++i) { + if (cur_item->items[i] == items->type) { + token = &tap_flow_items[items->type]; + break; + } + } + if (!token) + goto exit_item_not_supported; + cur_item = token; + err = tap_flow_item_validate( + items, cur_item->mask_sz, + (const uint8_t *)cur_item->mask, + (const uint8_t *)cur_item->default_mask); + if (err) + goto exit_item_not_supported; + if (flow && cur_item->convert) { + if (!pmd->flower_vlan_support && + cur_item->convert == tap_flow_create_vlan) + goto exit_item_not_supported; + err = cur_item->convert(items, &data); + if (err) + goto exit_item_not_supported; + } + } + if (flow) { + if (pmd->flower_vlan_support && data.vlan) { + nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE, + htons(ETH_P_8021Q)); + nlattr_add16(&flow->msg.nh, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + data.eth_type ? 
+ data.eth_type : htons(ETH_P_ALL)); + } else if (data.eth_type) { + nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE, + data.eth_type); + } + } + if (mirred && flow) { + uint16_t if_index = pmd->if_index; + + /* + * If attr->egress && mirred, then this is a special + * case where the rule must be applied on the tap, to + * redirect packets coming from the DPDK App, out + * through the remote netdevice. + */ + if (attr->egress) + if_index = pmd->remote_if_index; + if (add_action_mirred(flow, if_index, mirred) < 0) + goto exit_action_not_supported; + else + goto end; + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { + int err = 0; + + if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { + continue; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { + if (action) + goto exit_action_not_supported; + action = 1; + if (flow) + err = add_action_gact(flow, TC_ACT_SHOT); + } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) { + if (action) + goto exit_action_not_supported; + action = 1; + if (flow) + err = add_action_gact(flow, TC_ACT_UNSPEC); + } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { + const struct rte_flow_action_queue *queue = + (const struct rte_flow_action_queue *) + actions->conf; + if (action) + goto exit_action_not_supported; + action = 1; + if (!queue || (queue->index >= pmd->nb_queues)) + goto exit_action_not_supported; + if (flow) + err = add_action_skbedit(flow, queue->index); + } else { + goto exit_action_not_supported; + } + if (err) + goto exit_action_not_supported; + } +end: + if (flow) + nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */ + return 0; +exit_item_not_supported: + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + items, "item not supported"); + return -rte_errno; +exit_action_not_supported: + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + actions, "action not supported"); + return -rte_errno; +} + + + +/** + * Validate a flow. + * + * @see rte_flow_validate() + * @see rte_flow_ops + */ +static int +tap_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct pmd_internals *pmd = dev->data->dev_private; + + return priv_flow_process(pmd, attr, items, actions, error, NULL, 0); +} + +/** + * Set a unique handle in a flow. + * + * The kernel supports TC rules with equal priority, as long as they use the + * same matching fields (e.g.: dst mac and ipv4) with different values (and + * full mask to ensure no collision is possible). + * In those rules, the handle (uint32_t) is the part that would identify + * specifically each rule. + * + * On 32-bit architectures, the handle can simply be the flow's pointer address. + * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently) + * unique handle. + * + * @param[in, out] flow + * The flow that needs its handle set. + */ +static void +tap_flow_set_handle(struct rte_flow *flow) +{ + uint32_t handle = 0; + + if (sizeof(flow) > 4) + handle = rte_jhash(&flow, sizeof(flow), 1); + else + handle = (uintptr_t)flow; + /* must be at least 1 to avoid letting the kernel choose one for us */ + if (!handle) + handle = 1; + flow->msg.t.tcm_handle = handle; +} + +/** + * Create a flow. 
+ * + * @see rte_flow_create() + * @see rte_flow_ops + */ +static struct rte_flow * +tap_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct rte_flow *remote_flow = NULL; + struct rte_flow *flow = NULL; + struct nlmsg *msg = NULL; + int err; + + if (!pmd->if_index) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, + "can't create rule, ifindex not found"); + goto fail; + } + /* + * No rules configured through standard rte_flow should be set on the + * priorities used by implicit rules. + */ + if ((attr->group == MAX_GROUP) && + attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) { + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "priority value too big"); + goto fail; + } + flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); + if (!flow) { + rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "cannot allocate memory for rte_flow"); + goto fail; + } + msg = &flow->msg; + tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); + tap_flow_set_handle(flow); + if (priv_flow_process(pmd, attr, items, actions, error, flow, 0)) + goto fail; + err = nl_send(pmd->nlsk_fd, &msg->nh); + if (err < 0) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "couldn't send request to kernel"); + goto fail; + } + err = nl_recv_ack(pmd->nlsk_fd); + if (err < 0) { + RTE_LOG(ERR, PMD, + "Kernel refused TC filter rule creation (%d): %s\n", + errno, strerror(errno)); + rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "overlapping rules"); + goto fail; + } + LIST_INSERT_HEAD(&pmd->flows, flow, next); + /** + * If a remote device is configured, a TC rule with identical items for + * matching must be set on that device, with a single action: redirect + * to the local pmd->if_index. + */ + if (pmd->remote_if_index) { + remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); + if (!remote_flow) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "cannot allocate memory for rte_flow"); + goto fail; + } + msg = &remote_flow->msg; + /* set the rule if_index for the remote netdevice */ + tc_init_msg( + msg, pmd->remote_if_index, RTM_NEWTFILTER, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); + tap_flow_set_handle(remote_flow); + if (priv_flow_process(pmd, attr, items, NULL, + error, remote_flow, TCA_EGRESS_REDIR)) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "rte flow rule validation failed"); + goto fail; + } + err = nl_send(pmd->nlsk_fd, &msg->nh); + if (err < 0) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "Failure sending nl request"); + goto fail; + } + err = nl_recv_ack(pmd->nlsk_fd); + if (err < 0) { + RTE_LOG(ERR, PMD, + "Kernel refused TC filter rule creation (%d): %s\n", + errno, strerror(errno)); + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "overlapping rules"); + goto fail; + } + flow->remote_flow = remote_flow; + } + return flow; +fail: + if (remote_flow) + rte_free(remote_flow); + if (flow) + rte_free(flow); + return NULL; +} + +/** + * Destroy a flow using pointer to pmd_internal. 
+ * + * @param[in, out] pmd + * Pointer to private structure. + * @param[in] flow + * Pointer to the flow to destroy. + * @param[in, out] error + * Pointer to the flow error handler + * + * @return 0 if the flow could be destroyed, -1 otherwise. + */ +static int +tap_flow_destroy_pmd(struct pmd_internals *pmd, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct rte_flow *remote_flow = flow->remote_flow; + int ret = 0; + + LIST_REMOVE(flow, next); + flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + flow->msg.nh.nlmsg_type = RTM_DELTFILTER; + + ret = nl_send(pmd->nlsk_fd, &flow->msg.nh); + if (ret < 0) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "couldn't send request to kernel"); + goto end; + } + ret = nl_recv_ack(pmd->nlsk_fd); + /* If errno is ENOENT, the rule is already no longer in the kernel. */ + if (ret < 0 && errno == ENOENT) + ret = 0; + if (ret < 0) { + RTE_LOG(ERR, PMD, + "Kernel refused TC filter rule deletion (%d): %s\n", + errno, strerror(errno)); + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "couldn't receive kernel ack to our request"); + goto end; + } + if (remote_flow) { + remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER; + + ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh); + if (ret < 0) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "Failure sending nl request"); + goto end; + } + ret = nl_recv_ack(pmd->nlsk_fd); + if (ret < 0 && errno == ENOENT) + ret = 0; + if (ret < 0) { + RTE_LOG(ERR, PMD, + "Kernel refused TC filter rule deletion (%d): %s\n", + errno, strerror(errno)); + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "Failure trying to receive nl ack"); + goto end; + } + } +end: + if (remote_flow) + rte_free(remote_flow); + rte_free(flow); + return ret; +} + +/** + * Destroy a flow. + * + * @see rte_flow_destroy() + * @see rte_flow_ops + */ +static int +tap_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct pmd_internals *pmd = dev->data->dev_private; + + return tap_flow_destroy_pmd(pmd, flow, error); +} + +/** + * Destroy all flows. + * + * @see rte_flow_flush() + * @see rte_flow_ops + */ +int +tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error) +{ + struct pmd_internals *pmd = dev->data->dev_private; + struct rte_flow *flow; + + while (!LIST_EMPTY(&pmd->flows)) { + flow = LIST_FIRST(&pmd->flows); + if (tap_flow_destroy(dev, flow, error) < 0) + return -1; + } + return 0; +} + +/** + * Add an implicit flow rule on the remote device to make sure traffic gets to + * the tap netdevice from there. + * + * @param pmd + * Pointer to private structure. + * @param[in] idx + * The idx in the implicit_rte_flows array specifying which rule to apply. + * + * @return -1 if the rule couldn't be applied, 0 otherwise. 
+ */ +int tap_flow_implicit_create(struct pmd_internals *pmd, + enum implicit_rule_index idx) +{ + struct rte_flow_item *items = implicit_rte_flows[idx].items; + struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr; + struct rte_flow_item_eth eth_local = { .type = 0 }; + uint16_t if_index = pmd->remote_if_index; + struct rte_flow *remote_flow = NULL; + struct nlmsg *msg = NULL; + int err = 0; + struct rte_flow_item items_local[2] = { + [0] = { + .type = items[0].type, + .spec = ð_local, + .mask = items[0].mask, + }, + [1] = { + .type = items[1].type, + } + }; + + remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); + if (!remote_flow) { + RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow"); + goto fail; + } + msg = &remote_flow->msg; + if (idx == TAP_REMOTE_TX) { + if_index = pmd->if_index; + } else if (idx == TAP_REMOTE_LOCAL_MAC) { + /* + * eth addr couldn't be set in implicit_rte_flows[] as it is not + * known at compile time. + */ + memcpy(ð_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr)); + items = items_local; + } + tc_init_msg(msg, if_index, RTM_NEWTFILTER, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); + tap_flow_set_handle(remote_flow); + if (priv_flow_process(pmd, attr, items, NULL, NULL, + remote_flow, implicit_rte_flows[idx].mirred)) { + RTE_LOG(ERR, PMD, "rte flow rule validation failed\n"); + goto fail; + } + err = nl_send(pmd->nlsk_fd, &msg->nh); + if (err < 0) { + RTE_LOG(ERR, PMD, "Failure sending nl request"); + goto fail; + } + err = nl_recv_ack(pmd->nlsk_fd); + if (err < 0) { + RTE_LOG(ERR, PMD, + "Kernel refused TC filter rule creation (%d): %s\n", + errno, strerror(errno)); + goto fail; + } + LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next); + return 0; +fail: + if (remote_flow) + rte_free(remote_flow); + return -1; +} + +/** + * Remove specific implicit flow rule on the remote device. + * + * @param[in, out] pmd + * Pointer to private structure. + * @param[in] idx + * The idx in the implicit_rte_flows array specifying which rule to remove. + * + * @return -1 if one of the implicit rules couldn't be created, 0 otherwise. + */ +int tap_flow_implicit_destroy(struct pmd_internals *pmd, + enum implicit_rule_index idx) +{ + struct rte_flow *remote_flow; + int cur_prio = -1; + int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET; + + for (remote_flow = LIST_FIRST(&pmd->implicit_flows); + remote_flow; + remote_flow = LIST_NEXT(remote_flow, next)) { + cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK; + if (cur_prio != idx_prio) + continue; + return tap_flow_destroy_pmd(pmd, remote_flow, NULL); + } + return 0; +} + +/** + * Destroy all implicit flows. + * + * @see rte_flow_flush() + */ +int +tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error) +{ + struct rte_flow *remote_flow; + + while (!LIST_EMPTY(&pmd->implicit_flows)) { + remote_flow = LIST_FIRST(&pmd->implicit_flows); + if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0) + return -1; + } + return 0; +} + +/** + * Manage filter operations. + * + * @param dev + * Pointer to Ethernet device structure. + * @param filter_type + * Filter type. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. + * + * @return + * 0 on success, negative errno value on failure. 
+ */ +int +tap_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg) +{ + struct pmd_internals *pmd = dev->data->dev_private; + + if (!pmd->flower_support) + return -ENOTSUP; + switch (filter_type) { + case RTE_ETH_FILTER_GENERIC: + if (filter_op != RTE_ETH_FILTER_GET) + return -EINVAL; + *(const void **)arg = &tap_flow_ops; + return 0; + default: + RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported", + (void *)dev, filter_type); + } + return -EINVAL; +} + diff --git a/src/seastar/dpdk/drivers/net/tap/tap_flow.h b/src/seastar/dpdk/drivers/net/tap/tap_flow.h new file mode 100644 index 00000000..94414f18 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/tap_flow.h @@ -0,0 +1,82 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TAP_FLOW_H_ +#define _TAP_FLOW_H_ + +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_eth_tap.h> + +/** + * In TC, priority 0 means we require the kernel to allocate one for us. + * In rte_flow, however, we want the priority 0 to be the most important one. + * Use an offset to have the most important priority being 1 in TC. + */ +#define PRIORITY_OFFSET 1 +#define PRIORITY_MASK (0xfff) +#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET) +#define GROUP_MASK (0xf) +#define GROUP_SHIFT 12 +#define MAX_GROUP GROUP_MASK + +/** + * These index are actually in reversed order: their priority is processed + * by subtracting their value to the lowest priority (PRIORITY_MASK). + * Thus the first one will have the lowest priority in the end + * (but biggest value). 
+ */ +enum implicit_rule_index { + TAP_REMOTE_TX, + TAP_REMOTE_BROADCASTV6, + TAP_REMOTE_BROADCAST, + TAP_REMOTE_ALLMULTI, + TAP_REMOTE_PROMISC, + TAP_REMOTE_LOCAL_MAC, + TAP_REMOTE_MAX_IDX, +}; + +int tap_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); +int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error); + +int tap_flow_implicit_create(struct pmd_internals *pmd, + enum implicit_rule_index idx); +int tap_flow_implicit_destroy(struct pmd_internals *pmd, + enum implicit_rule_index idx); +int tap_flow_implicit_flush(struct pmd_internals *pmd, + struct rte_flow_error *error); + +#endif /* _TAP_FLOW_H_ */ diff --git a/src/seastar/dpdk/drivers/net/tap/tap_netlink.c b/src/seastar/dpdk/drivers/net/tap/tap_netlink.c new file mode 100644 index 00000000..ee92e2e7 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/tap_netlink.c @@ -0,0 +1,367 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <errno.h> +#include <inttypes.h> +#include <linux/netlink.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> + +#include <rte_malloc.h> +#include <tap_netlink.h> +#include <rte_random.h> + +/* Must be quite large to support dumping a huge list of QDISC or filters. */ +#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */ +#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */ +#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */ + +struct nested_tail { + struct rtattr *tail; + struct nested_tail *prev; +}; + +/** + * Initialize a netlink socket for communicating with the kernel. + * + * @param nl_groups + * Set it to a netlink group value (e.g. RTMGRP_LINK) to receive messages for + * specific netlink multicast groups. Otherwise, no subscription will be made. + * + * @return + * netlink socket file descriptor on success, -1 otherwise. 
+ */ +int +nl_init(uint32_t nl_groups) +{ + int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE; + struct sockaddr_nl local = { + .nl_family = AF_NETLINK, + .nl_groups = nl_groups, + }; + + fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (fd < 0) { + RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n"); + return -1; + } + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) { + RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n"); + return -1; + } + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) { + RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n"); + return -1; + } + if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) { + RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n"); + return -1; + } + return fd; +} + +/** + * Clean up a netlink socket once all communicating with the kernel is finished. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * + * @return + * 0 on success, -1 otherwise. + */ +int +nl_final(int nlsk_fd) +{ + if (close(nlsk_fd)) { + RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n", + strerror(errno), errno); + return -1; + } + return 0; +} + +/** + * Send a message to the kernel on the netlink socket. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] nh + * The netlink message send to the kernel. + * + * @return + * the number of sent bytes on success, -1 otherwise. + */ +int +nl_send(int nlsk_fd, struct nlmsghdr *nh) +{ + /* man 7 netlink EXAMPLE */ + struct sockaddr_nl sa = { + .nl_family = AF_NETLINK, + }; + struct iovec iov = { + .iov_base = nh, + .iov_len = nh->nlmsg_len, + }; + struct msghdr msg = { + .msg_name = &sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + int send_bytes; + + nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ + nh->nlmsg_seq = (uint32_t)rte_rand(); + send_bytes = sendmsg(nlsk_fd, &msg, 0); + if (send_bytes < 0) { + RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n", + strerror(errno), errno); + return -1; + } + return send_bytes; +} + +/** + * Check that the kernel sends an appropriate ACK in response to an nl_send(). + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int +nl_recv_ack(int nlsk_fd) +{ + return nl_recv(nlsk_fd, NULL, NULL); +} + +/** + * Receive a message from the kernel on the netlink socket, following an + * nl_send(). + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] cb + * The callback function to call for each netlink message received. + * @param[in, out] arg + * Custom arguments for the callback. + * + * @return + * 0 on success, -1 otherwise with errno set. 
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		/* One message at a time */
+		.msg_iovlen = 1,
+	};
+	int multipart = 0;
+	int ret = 0;
+
+	do {
+		struct nlmsghdr *nh;
+		int recv_bytes = 0;
+
+		recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+		if (recv_bytes < 0)
+			return -1;
+		for (nh = (struct nlmsghdr *)buf;
+		     NLMSG_OK(nh, (unsigned int)recv_bytes);
+		     nh = NLMSG_NEXT(nh, recv_bytes)) {
+			if (nh->nlmsg_type == NLMSG_ERROR) {
+				struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+				if (err_data->error < 0) {
+					errno = -err_data->error;
+					return -1;
+				}
+				/* Ack message. */
+				return 0;
+			}
+			/* Multi-part msgs and their trailing DONE message. */
+			if (nh->nlmsg_flags & NLM_F_MULTI) {
+				if (nh->nlmsg_type == NLMSG_DONE)
+					return 0;
+				multipart = 1;
+			}
+			if (cb)
+				ret = cb(nh, arg);
+		}
+	} while (multipart);
+	return ret;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ * The netlink message where the attribute will be appended.
+ * @param[in] type
+ * The type of attribute to append.
+ * @param[in] data_len
+ * The length of the data to append.
+ * @param[in] data
+ * The data to append.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ * The netlink message where the attribute will be appended.
+ * @param[in] type
+ * The type of attribute to append.
+ * @param[in] data
+ * The data to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(uint8_t), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ * The netlink message where the attribute will be appended.
+ * @param[in] type
+ * The type of attribute to append.
+ * @param[in] data
+ * The data to append.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(uint16_t), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ * The netlink message where the attribute will be appended.
+ * @param[in] type
+ * The type of attribute to append.
+ * @param[in] data
+ * The data to append.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(uint32_t), &data);
+}
+
+/**
+ * Start a nested netlink attribute.
+ * It must be followed later by a call to nlattr_nested_finish().
+ *
+ * @param[in, out] msg
+ * The netlink message where to edit the nested_tails metadata.
+ * @param[in] type
+ * The nested attribute type to append.
+ *
+ * @return
+ * -1 if adding a nested netlink attribute failed, 0 otherwise.
+ */ +int +nlattr_nested_start(struct nlmsg *msg, uint16_t type) +{ + struct nested_tail *tail; + + tail = rte_zmalloc(NULL, sizeof(struct nested_tail), 0); + if (!tail) { + RTE_LOG(ERR, PMD, + "Couldn't allocate memory for nested netlink" + " attribute\n"); + return -1; + } + + tail->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh); + + nlattr_add(&msg->nh, type, 0, NULL); + + tail->prev = msg->nested_tails; + + msg->nested_tails = tail; + + return 0; +} + +/** + * End a nested netlink attribute. + * It follows a call to nlattr_nested_start(). + * In effect, it will modify the nested attribute length to include every bytes + * from the nested attribute start, up to here. + * + * @param[in, out] msg + * The netlink message where to edit the nested_tails metadata. + */ +void +nlattr_nested_finish(struct nlmsg *msg) +{ + struct nested_tail *tail = msg->nested_tails; + + tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail; + + if (tail->prev) + msg->nested_tails = tail->prev; + + rte_free(tail); +} diff --git a/src/seastar/dpdk/drivers/net/tap/tap_netlink.h b/src/seastar/dpdk/drivers/net/tap/tap_netlink.h new file mode 100644 index 00000000..98e13902 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/tap_netlink.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TAP_NETLINK_H_ +#define _TAP_NETLINK_H_ + +#include <ctype.h> +#include <inttypes.h> +#include <linux/rtnetlink.h> +#include <linux/netlink.h> +#include <stdio.h> + +#include <rte_log.h> + +#define NLMSG_BUF 512 + +struct nlmsg { + struct nlmsghdr nh; + struct tcmsg t; + char buf[NLMSG_BUF]; + struct nested_tail *nested_tails; +}; + +#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len)) + +int nl_init(uint32_t nl_groups); +int nl_final(int nlsk_fd); +int nl_send(int nlsk_fd, struct nlmsghdr *nh); +int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg); +int nl_recv_ack(int nlsk_fd); +void nlattr_add(struct nlmsghdr *nh, unsigned short type, + unsigned int data_len, const void *data); +void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data); +void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data); +void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data); +int nlattr_nested_start(struct nlmsg *msg, uint16_t type); +void nlattr_nested_finish(struct nlmsg *msg); + +#endif /* _TAP_NETLINK_H_ */ diff --git a/src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.c b/src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.c new file mode 100644 index 00000000..d74ac805 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.c @@ -0,0 +1,323 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <inttypes.h> +#include <linux/netlink.h> +#include <net/if.h> +#include <string.h> + +#include <rte_log.h> +#include <tap_tcmsgs.h> + +struct qdisc { + uint32_t handle; + uint32_t parent; +}; + +struct list_args { + int nlsk_fd; + uint16_t ifindex; + void *custom_arg; +}; + +struct qdisc_custom_arg { + uint32_t handle; + uint32_t parent; + uint8_t exists; +}; + +/** + * Initialize a netlink message with a TC header. + * + * @param[in, out] msg + * The netlink message to fill. + * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. 
+ * @param[in] type + * The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.). + * @param[in] flags + * Overrides the default netlink flags for this msg with those specified. + */ +void +tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags) +{ + struct nlmsghdr *n = &msg->nh; + + n->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + n->nlmsg_type = type; + if (flags) + n->nlmsg_flags = flags; + else + n->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + msg->t.tcm_family = AF_UNSPEC; + msg->t.tcm_ifindex = ifindex; +} + +/** + * Delete a specific QDISC identified by its iface, and it's handle and parent. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] ifindex + * The netdevice ifindex on whom the deletion will happen. + * @param[in] qinfo + * Additional info to identify the QDISC (handle and parent). + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +static int +qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo) +{ + struct nlmsg msg; + int fd = 0; + + tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0); + msg.t.tcm_handle = qinfo->handle; + msg.t.tcm_parent = qinfo->parent; + /* if no netlink socket is provided, create one */ + if (!nlsk_fd) { + fd = nl_init(0); + if (fd < 0) { + RTE_LOG(ERR, PMD, + "Could not delete QDISC: null netlink socket\n"); + return -1; + } + } else { + fd = nlsk_fd; + } + if (nl_send(fd, &msg.nh) < 0) + goto error; + if (nl_recv_ack(fd) < 0) + goto error; + if (!nlsk_fd) + return nl_final(fd); + return 0; +error: + if (!nlsk_fd) + nl_final(fd); + return -1; +} + +/** + * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] ifindex + * The netdevice ifindex where to add the multiqueue QDISC. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int +qdisc_add_multiq(int nlsk_fd, uint16_t ifindex) +{ + struct tc_multiq_qopt opt; + struct nlmsg msg; + + tc_init_msg(&msg, ifindex, RTM_NEWQDISC, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0); + msg.t.tcm_parent = TC_H_ROOT; + nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq"); + nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt); + if (nl_send(nlsk_fd, &msg.nh) < 0) + return -1; + if (nl_recv_ack(nlsk_fd) < 0) + return -1; + return 0; +} + +/** + * Add the ingress QDISC with default ffff: handle. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] ifindex + * The netdevice ifindex where the QDISC will be added. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int +qdisc_add_ingress(int nlsk_fd, uint16_t ifindex) +{ + struct nlmsg msg; + + tc_init_msg(&msg, ifindex, RTM_NEWQDISC, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); + msg.t.tcm_parent = TC_H_INGRESS; + nlattr_add(&msg.nh, TCA_KIND, sizeof("ingress"), "ingress"); + if (nl_send(nlsk_fd, &msg.nh) < 0) + return -1; + if (nl_recv_ack(nlsk_fd) < 0) + return -1; + return 0; +} + +/** + * Callback function to delete a QDISC. + * + * @param[in] nh + * The netlink message to parse, received from the kernel. + * @param[in] arg + * Custom arguments for the callback. + * + * @return + * 0 on success, -1 otherwise with errno set. 
+ */ +static int +qdisc_del_cb(struct nlmsghdr *nh, void *arg) +{ + struct tcmsg *t = NLMSG_DATA(nh); + struct list_args *args = arg; + + struct qdisc qinfo = { + .handle = t->tcm_handle, + .parent = t->tcm_parent, + }; + + /* filter out other ifaces' qdiscs */ + if (args->ifindex != (unsigned int)t->tcm_ifindex) + return 0; + /* + * Use another nlsk_fd (0) to avoid tampering with the current list + * iteration. + */ + return qdisc_del(0, args->ifindex, &qinfo); +} + +/** + * Iterate over all QDISC, and call the callback() function for each. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] ifindex + * The netdevice ifindex where to find QDISCs. + * @param[in] callback + * The function to call for each QDISC. + * @param[in, out] arg + * The arguments to provide the callback function with. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +static int +qdisc_iterate(int nlsk_fd, uint16_t ifindex, + int (*callback)(struct nlmsghdr *, void *), void *arg) +{ + struct nlmsg msg; + struct list_args args = { + .nlsk_fd = nlsk_fd, + .ifindex = ifindex, + .custom_arg = arg, + }; + + tc_init_msg(&msg, ifindex, RTM_GETQDISC, NLM_F_REQUEST | NLM_F_DUMP); + if (nl_send(nlsk_fd, &msg.nh) < 0) + return -1; + if (nl_recv(nlsk_fd, callback, &args) < 0) + return -1; + return 0; +} + +/** + * Delete all QDISCs for a given netdevice. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] ifindex + * The netdevice ifindex where to find QDISCs. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int +qdisc_flush(int nlsk_fd, uint16_t ifindex) +{ + return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL); +} + +/** + * Create the multiqueue QDISC, only if it does not exist already. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] ifindex + * The netdevice ifindex where to add the multiqueue QDISC. + * + * @return + * 0 if the qdisc exists or if has been successfully added. + * Return -1 otherwise. + */ +int +qdisc_create_multiq(int nlsk_fd, uint16_t ifindex) +{ + int err = 0; + + err = qdisc_add_multiq(nlsk_fd, ifindex); + if (err < 0 && errno != -EEXIST) { + RTE_LOG(ERR, PMD, "Could not add multiq qdisc (%d): %s\n", + errno, strerror(errno)); + return -1; + } + return 0; +} + +/** + * Create the ingress QDISC, only if it does not exist already. + * + * @param[in] nlsk_fd + * The netlink socket file descriptor used for communication. + * @param[in] ifindex + * The netdevice ifindex where to add the ingress QDISC. + * + * @return + * 0 if the qdisc exists or if has been successfully added. + * Return -1 otherwise. + */ +int +qdisc_create_ingress(int nlsk_fd, uint16_t ifindex) +{ + int err = 0; + + err = qdisc_add_ingress(nlsk_fd, ifindex); + if (err < 0 && errno != -EEXIST) { + RTE_LOG(ERR, PMD, "Could not add ingress qdisc (%d): %s\n", + errno, strerror(errno)); + return -1; + } + return 0; +} diff --git a/src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.h b/src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.h new file mode 100644 index 00000000..78959577 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/tap/tap_tcmsgs.h @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
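
Usage sketch (not part of the patch above): an application could exercise the flower-based rte_flow path implemented in tap_flow.c by steering TCP traffic for a given destination port to a specific Rx queue, roughly as below. The helper name, the uint8_t port id (as in this DPDK release), the TCP port and the queue index are illustrative assumptions; the full 0xffff port mask reflects the exact-match requirement enforced by tap_flow_create_tcp(), and the queue index must stay below the number of Rx queues configured on the tap port.

#include <stdint.h>
#include <rte_byteorder.h>
#include <rte_flow.h>

/* Hypothetical helper, for illustration only. */
static struct rte_flow *
tap_steer_tcp_dport(uint8_t port_id, uint16_t tcp_dport, uint16_t rx_queue,
		    struct rte_flow_error *error)
{
	struct rte_flow_attr attr = { .ingress = 1 };
	/* TC flower needs an exact L4 port match, hence the full mask. */
	struct rte_flow_item_tcp tcp_spec = {
		.hdr = { .dst_port = rte_cpu_to_be_16(tcp_dport) },
	};
	struct rte_flow_item_tcp tcp_mask = {
		.hdr = { .dst_port = 0xffff },
	};
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_TCP,
		  .spec = &tcp_spec, .mask = &tcp_mask },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	/* Must be lower than the number of configured Rx queues. */
	struct rte_flow_action_queue queue = { .index = rx_queue };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	if (rte_flow_validate(port_id, &attr, pattern, actions, error))
		return NULL;
	return rte_flow_create(port_id, &attr, pattern, actions, error);
}

Internally, such a rule ends up as an RTM_NEWTFILTER flower filter with a skbedit queue_mapping action, built by priv_flow_process() and add_action_skbedit() in tap_flow.c; when a remote netdevice is configured, a mirrored redirect rule is installed on it as well.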