diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/seastar/dpdk/drivers/net/mlx4 | |
parent | Initial commit. (diff) | |
download | ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip |
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/seastar/dpdk/drivers/net/mlx4')
21 files changed, 10759 insertions, 0 deletions
diff --git a/src/seastar/dpdk/drivers/net/mlx4/Makefile b/src/seastar/dpdk/drivers/net/mlx4/Makefile new file mode 100644 index 000000000..8126b0dfc --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/Makefile @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2012 6WIND S.A. +# Copyright 2012 Mellanox Technologies, Ltd + +include $(RTE_SDK)/mk/rte.vars.mk + +# Library name. +LIB = librte_pmd_mlx4.a +LIB_GLUE = $(LIB_GLUE_BASE).$(LIB_GLUE_VERSION) +LIB_GLUE_BASE = librte_pmd_mlx4_glue.so +LIB_GLUE_VERSION = 18.02.0 + +# Sources. +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_ethdev.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c +ifneq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y) +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_glue.c +endif +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mp.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_txq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c + +ifeq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y) +INSTALL-$(CONFIG_RTE_LIBRTE_MLX4_PMD)-lib += $(LIB_GLUE) +endif + +# Basic CFLAGS. +CFLAGS += -O3 +CFLAGS += -std=c11 -Wall -Wextra +CFLAGS += -g +CFLAGS += -I. +CFLAGS += -D_BSD_SOURCE +CFLAGS += -D_DEFAULT_SOURCE +CFLAGS += -D_XOPEN_SOURCE=600 +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -DALLOW_EXPERIMENTAL_API +ifeq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y) +CFLAGS += -DMLX4_GLUE='"$(LIB_GLUE)"' +CFLAGS += -DMLX4_GLUE_VERSION='"$(LIB_GLUE_VERSION)"' +CFLAGS_mlx4_glue.o += -fPIC +LDLIBS += -ldl +else ifeq ($(CONFIG_RTE_IBVERBS_LINK_STATIC),y) +LDLIBS += $(shell $(RTE_SDK)/buildtools/options-ibverbs-static.sh) +else +LDLIBS += -libverbs -lmlx4 +endif +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs +LDLIBS += -lrte_bus_pci + +# A few warnings cannot be avoided in external headers. +CFLAGS += -Wno-error=cast-qual + +EXPORT_MAP := rte_pmd_mlx4_version.map +LIBABIVER := 1 + +# DEBUG which is usually provided on the command-line may enable +# CONFIG_RTE_LIBRTE_MLX4_DEBUG. +ifeq ($(DEBUG),1) +CONFIG_RTE_LIBRTE_MLX4_DEBUG := y +endif + +# User-defined CFLAGS. +ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DEBUG),y) +CFLAGS += -pedantic -UNDEBUG -DPEDANTIC +else +CFLAGS += -DNDEBUG -UPEDANTIC +endif + +include $(RTE_SDK)/mk/rte.lib.mk + +# Generate and clean-up mlx4_autoconf.h. + +export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS +export AUTO_CONFIG_CFLAGS = -Wno-error + +ifndef V +AUTOCONF_OUTPUT := >/dev/null +endif + +mlx4_autoconf.h.new: FORCE + +mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh + $Q $(RM) -f -- '$@' + $Q : > '$@' + $Q sh -- '$<' '$@' \ + HAVE_IBV_MLX4_BUF_ALLOCATORS \ + infiniband/mlx4dv.h \ + enum MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IBV_MLX4_UAR_MMAP_OFFSET \ + infiniband/mlx4dv.h \ + enum MLX4DV_QP_MASK_UAR_MMAP_OFFSET \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IBV_MLX4_WQE_LSO_SEG \ + infiniband/mlx4dv.h \ + type 'struct mlx4_wqe_lso_seg' \ + $(AUTOCONF_OUTPUT) + +# Create mlx4_autoconf.h or update it in case it differs from the new one. + +mlx4_autoconf.h: mlx4_autoconf.h.new + $Q [ -f '$@' ] && \ + cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \ + mv '$<' '$@' + +$(SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD):.c=.o): mlx4_autoconf.h + +# Generate dependency plug-in for rdma-core when the PMD must not be linked +# directly, so that applications do not inherit this dependency. + +ifeq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y) + +$(LIB): $(LIB_GLUE) + +ifeq ($(LINK_USING_CC),1) +GLUE_LDFLAGS := $(call linkerprefix,$(LDFLAGS)) +else +GLUE_LDFLAGS := $(LDFLAGS) +endif +$(LIB_GLUE): mlx4_glue.o + $Q $(LD) $(GLUE_LDFLAGS) $(EXTRA_LDFLAGS) \ + -Wl,-h,$(LIB_GLUE) \ + -shared -o $@ $< -libverbs -lmlx4 + +mlx4_glue.o: mlx4_autoconf.h + +endif + +clean_mlx4: FORCE + $Q rm -f -- mlx4_autoconf.h mlx4_autoconf.h.new + $Q rm -f -- mlx4_glue.o $(LIB_GLUE_BASE)* + +clean: clean_mlx4 diff --git a/src/seastar/dpdk/drivers/net/mlx4/meson.build b/src/seastar/dpdk/drivers/net/mlx4/meson.build new file mode 100644 index 000000000..2540489bb --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/meson.build @@ -0,0 +1,118 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2018 6WIND S.A. +# Copyright 2018 Mellanox Technologies, Ltd + +pmd_dlopen = (get_option('ibverbs_link') == 'dlopen') +LIB_GLUE_BASE = 'librte_pmd_mlx4_glue.so' +LIB_GLUE_VERSION = '18.02.0' +LIB_GLUE = LIB_GLUE_BASE + '.' + LIB_GLUE_VERSION +if pmd_dlopen + dpdk_conf.set('RTE_IBVERBS_LINK_DLOPEN', 1) + cflags += [ + '-DMLX4_GLUE="@0@"'.format(LIB_GLUE), + '-DMLX4_GLUE_VERSION="@0@"'.format(LIB_GLUE_VERSION), + ] +endif +libnames = [ 'mnl', 'mlx4', 'ibverbs' ] +libs = [] +build = true +foreach libname:libnames + lib = dependency('lib' + libname, required:false) + if not lib.found() + lib = cc.find_library(libname, required:false) + endif + if lib.found() + libs += [ lib ] + else + build = false + endif +endforeach +# Compile PMD +if build + allow_experimental_apis = true + ext_deps += libs + sources = files( + 'mlx4.c', + 'mlx4_ethdev.c', + 'mlx4_flow.c', + 'mlx4_intr.c', + 'mlx4_mp.c', + 'mlx4_mr.c', + 'mlx4_rxq.c', + 'mlx4_rxtx.c', + 'mlx4_txq.c', + 'mlx4_utils.c', + ) + if not pmd_dlopen + sources += files('mlx4_glue.c') + endif + cflags_options = [ + '-Wextra', + '-std=c11', + '-Wno-strict-prototypes', + '-D_BSD_SOURCE', + '-D_DEFAULT_SOURCE', + '-D_XOPEN_SOURCE=600' + ] + foreach option:cflags_options + if cc.has_argument(option) + cflags += option + endif + endforeach + if get_option('buildtype').contains('debug') + cflags += [ '-pedantic', '-UNDEBUG', '-DPEDANTIC' ] + else + cflags += [ '-DNDEBUG', '-UPEDANTIC' ] + endif + # To maintain the compatibility with the make build system + # mlx4_autoconf.h file is still generated. + # input array for meson member search: + # [ "MACRO to define if found", "header for the search", + # "symbol to search","struct member to search" ] + # + has_member_args = [ + [ 'HAVE_IBV_MLX4_WQE_LSO_SEG', 'infiniband/mlx4dv.h', + 'struct mlx4_wqe_lso_seg', 'mss_hdr_size' ], + ] + # input array for meson symbol search: + # [ "MACRO to define if found", "header for the search", + # "symbol to search" ] + has_sym_args = [ + [ 'HAVE_IBV_MLX4_BUF_ALLOCATORS', 'infiniband/mlx4dv.h', + 'MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS' ], + [ 'HAVE_IBV_MLX4_UAR_MMAP_OFFSET', 'infiniband/mlx4dv.h', + 'MLX4DV_QP_MASK_UAR_MMAP_OFFSET' ], + ] + config = configuration_data() + foreach arg:has_sym_args + config.set(arg[0], cc.has_header_symbol(arg[1], arg[2])) + endforeach + foreach arg:has_member_args + file_prefix = '#include<' + arg[1] + '>' + config.set(arg[0], cc.has_member(arg[2], arg[3], + prefix : file_prefix)) + endforeach + configure_file(output : 'mlx4_autoconf.h', configuration : config) +endif +# Build Glue Library +if pmd_dlopen and build + dlopen_name = 'mlx4_glue' + dlopen_lib_name = driver_name_fmt.format(dlopen_name) + dlopen_so_version = LIB_GLUE_VERSION + dlopen_sources = files('mlx4_glue.c') + dlopen_install_dir = [ eal_pmd_path + '-glue' ] + shared_lib = shared_library( + dlopen_lib_name, + dlopen_sources, + include_directories: global_inc, + c_args: cflags, + dependencies: libs, + link_args: [ + '-Wl,-export-dynamic', + '-Wl,-h,@0@'.format(LIB_GLUE), + ], + soversion: dlopen_so_version, + install: true, + install_dir: dlopen_install_dir, + ) +endif diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4.c new file mode 100644 index 000000000..fe559c040 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4.c @@ -0,0 +1,1314 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2012 6WIND S.A. + * Copyright 2012 Mellanox Technologies, Ltd + */ + +/** + * @file + * mlx4 driver initialization. + */ + +#include <assert.h> +#include <dlfcn.h> +#include <errno.h> +#include <inttypes.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_config.h> +#include <rte_dev.h> +#include <rte_errno.h> +#include <rte_ethdev_driver.h> +#include <rte_ethdev_pci.h> +#include <rte_ether.h> +#include <rte_flow.h> +#include <rte_interrupts.h> +#include <rte_kvargs.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> + +#include "mlx4.h" +#include "mlx4_glue.h" +#include "mlx4_flow.h" +#include "mlx4_mr.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +static const char *MZ_MLX4_PMD_SHARED_DATA = "mlx4_pmd_shared_data"; + +/* Shared memory between primary and secondary processes. */ +struct mlx4_shared_data *mlx4_shared_data; + +/* Spinlock for mlx4_shared_data allocation. */ +static rte_spinlock_t mlx4_shared_data_lock = RTE_SPINLOCK_INITIALIZER; + +/* Process local data for secondary processes. */ +static struct mlx4_local_data mlx4_local_data; + +/** Configuration structure for device arguments. */ +struct mlx4_conf { + struct { + uint32_t present; /**< Bit-field for existing ports. */ + uint32_t enabled; /**< Bit-field for user-enabled ports. */ + } ports; + int mr_ext_memseg_en; + /** Whether memseg should be extended for MR creation. */ +}; + +/* Available parameters list. */ +const char *pmd_mlx4_init_params[] = { + MLX4_PMD_PORT_KVARG, + MLX4_MR_EXT_MEMSEG_EN_KVARG, + NULL, +}; + +static void mlx4_dev_stop(struct rte_eth_dev *dev); + +/** + * Initialize shared data between primary and secondary process. + * + * A memzone is reserved by primary process and secondary processes attach to + * the memzone. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_init_shared_data(void) +{ + const struct rte_memzone *mz; + int ret = 0; + + rte_spinlock_lock(&mlx4_shared_data_lock); + if (mlx4_shared_data == NULL) { + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* Allocate shared memory. */ + mz = rte_memzone_reserve(MZ_MLX4_PMD_SHARED_DATA, + sizeof(*mlx4_shared_data), + SOCKET_ID_ANY, 0); + if (mz == NULL) { + ERROR("Cannot allocate mlx4 shared data\n"); + ret = -rte_errno; + goto error; + } + mlx4_shared_data = mz->addr; + memset(mlx4_shared_data, 0, sizeof(*mlx4_shared_data)); + rte_spinlock_init(&mlx4_shared_data->lock); + } else { + /* Lookup allocated shared memory. */ + mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA); + if (mz == NULL) { + ERROR("Cannot attach mlx4 shared data\n"); + ret = -rte_errno; + goto error; + } + mlx4_shared_data = mz->addr; + memset(&mlx4_local_data, 0, sizeof(mlx4_local_data)); + } + } +error: + rte_spinlock_unlock(&mlx4_shared_data_lock); + return ret; +} + +#ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS +/** + * Verbs callback to allocate a memory. This function should allocate the space + * according to the size provided residing inside a huge page. + * Please note that all allocation must respect the alignment from libmlx4 + * (i.e. currently sysconf(_SC_PAGESIZE)). + * + * @param[in] size + * The size in bytes of the memory to allocate. + * @param[in] data + * A pointer to the callback data. + * + * @return + * Allocated buffer, NULL otherwise and rte_errno is set. + */ +static void * +mlx4_alloc_verbs_buf(size_t size, void *data) +{ + struct mlx4_priv *priv = data; + void *ret; + size_t alignment = sysconf(_SC_PAGESIZE); + unsigned int socket = SOCKET_ID_ANY; + + if (priv->verbs_alloc_ctx.type == MLX4_VERBS_ALLOC_TYPE_TX_QUEUE) { + const struct txq *txq = priv->verbs_alloc_ctx.obj; + + socket = txq->socket; + } else if (priv->verbs_alloc_ctx.type == + MLX4_VERBS_ALLOC_TYPE_RX_QUEUE) { + const struct rxq *rxq = priv->verbs_alloc_ctx.obj; + + socket = rxq->socket; + } + assert(data != NULL); + ret = rte_malloc_socket(__func__, size, alignment, socket); + if (!ret && size) + rte_errno = ENOMEM; + return ret; +} + +/** + * Verbs callback to free a memory. + * + * @param[in] ptr + * A pointer to the memory to free. + * @param[in] data + * A pointer to the callback data. + */ +static void +mlx4_free_verbs_buf(void *ptr, void *data __rte_unused) +{ + assert(data != NULL); + rte_free(ptr); +} +#endif + +/** + * Initialize process private data structure. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_proc_priv_init(struct rte_eth_dev *dev) +{ + struct mlx4_proc_priv *ppriv; + size_t ppriv_size; + + /* + * UAR register table follows the process private structure. BlueFlame + * registers for Tx queues are stored in the table. + */ + ppriv_size = sizeof(struct mlx4_proc_priv) + + dev->data->nb_tx_queues * sizeof(void *); + ppriv = rte_malloc_socket("mlx4_proc_priv", ppriv_size, + RTE_CACHE_LINE_SIZE, dev->device->numa_node); + if (!ppriv) { + rte_errno = ENOMEM; + return -rte_errno; + } + ppriv->uar_table_sz = ppriv_size; + dev->process_private = ppriv; + return 0; +} + +/** + * Un-initialize process private data structure. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx4_proc_priv_uninit(struct rte_eth_dev *dev) +{ + if (!dev->process_private) + return; + rte_free(dev->process_private); + dev->process_private = NULL; +} + +/** + * DPDK callback for Ethernet device configuration. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_dev_configure(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow_error error; + int ret; + + /* Prepare internal flow rules. */ + ret = mlx4_flow_sync(priv, &error); + if (ret) { + ERROR("cannot set up internal flow rules (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + -ret, strerror(-ret), error.type, error.cause, + error.message ? error.message : "(unspecified)"); + goto exit; + } + ret = mlx4_intr_install(priv); + if (ret) { + ERROR("%p: interrupt handler installation failed", + (void *)dev); + goto exit; + } + ret = mlx4_proc_priv_init(dev); + if (ret) { + ERROR("%p: process private data allocation failed", + (void *)dev); + goto exit; + } +exit: + return ret; +} + +/** + * DPDK callback to start the device. + * + * Simulate device start by initializing common RSS resources and attaching + * all configured flows. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_dev_start(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow_error error; + int ret; + + if (priv->started) + return 0; + DEBUG("%p: attaching configured flows to all RX queues", (void *)dev); + priv->started = 1; + ret = mlx4_rss_init(priv); + if (ret) { + ERROR("%p: cannot initialize RSS resources: %s", + (void *)dev, strerror(-ret)); + goto err; + } +#ifndef NDEBUG + mlx4_mr_dump_dev(dev); +#endif + ret = mlx4_rxq_intr_enable(priv); + if (ret) { + ERROR("%p: interrupt handler installation failed", + (void *)dev); + goto err; + } + ret = mlx4_flow_sync(priv, &error); + if (ret) { + ERROR("%p: cannot attach flow rules (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + (void *)dev, + -ret, strerror(-ret), error.type, error.cause, + error.message ? error.message : "(unspecified)"); + goto err; + } + rte_wmb(); + dev->tx_pkt_burst = mlx4_tx_burst; + dev->rx_pkt_burst = mlx4_rx_burst; + /* Enable datapath on secondary process. */ + mlx4_mp_req_start_rxtx(dev); + return 0; +err: + mlx4_dev_stop(dev); + return ret; +} + +/** + * DPDK callback to stop the device. + * + * Simulate device stop by detaching all configured flows. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx4_dev_stop(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + + if (!priv->started) + return; + DEBUG("%p: detaching flows from all RX queues", (void *)dev); + priv->started = 0; + dev->tx_pkt_burst = mlx4_tx_burst_removed; + dev->rx_pkt_burst = mlx4_rx_burst_removed; + rte_wmb(); + /* Disable datapath on secondary process. */ + mlx4_mp_req_stop_rxtx(dev); + mlx4_flow_sync(priv, NULL); + mlx4_rxq_intr_disable(priv); + mlx4_rss_deinit(priv); +} + +/** + * DPDK callback to close the device. + * + * Destroy all queues and objects, free memory. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx4_dev_close(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + unsigned int i; + + DEBUG("%p: closing device \"%s\"", + (void *)dev, + ((priv->ctx != NULL) ? priv->ctx->device->name : "")); + dev->rx_pkt_burst = mlx4_rx_burst_removed; + dev->tx_pkt_burst = mlx4_tx_burst_removed; + rte_wmb(); + /* Disable datapath on secondary process. */ + mlx4_mp_req_stop_rxtx(dev); + mlx4_flow_clean(priv); + mlx4_rss_deinit(priv); + for (i = 0; i != dev->data->nb_rx_queues; ++i) + mlx4_rx_queue_release(dev->data->rx_queues[i]); + for (i = 0; i != dev->data->nb_tx_queues; ++i) + mlx4_tx_queue_release(dev->data->tx_queues[i]); + mlx4_proc_priv_uninit(dev); + mlx4_mr_release(dev); + if (priv->pd != NULL) { + assert(priv->ctx != NULL); + claim_zero(mlx4_glue->dealloc_pd(priv->pd)); + claim_zero(mlx4_glue->close_device(priv->ctx)); + } else + assert(priv->ctx == NULL); + mlx4_intr_uninstall(priv); + memset(priv, 0, sizeof(*priv)); +} + +static const struct eth_dev_ops mlx4_dev_ops = { + .dev_configure = mlx4_dev_configure, + .dev_start = mlx4_dev_start, + .dev_stop = mlx4_dev_stop, + .dev_set_link_down = mlx4_dev_set_link_down, + .dev_set_link_up = mlx4_dev_set_link_up, + .dev_close = mlx4_dev_close, + .link_update = mlx4_link_update, + .promiscuous_enable = mlx4_promiscuous_enable, + .promiscuous_disable = mlx4_promiscuous_disable, + .allmulticast_enable = mlx4_allmulticast_enable, + .allmulticast_disable = mlx4_allmulticast_disable, + .mac_addr_remove = mlx4_mac_addr_remove, + .mac_addr_add = mlx4_mac_addr_add, + .mac_addr_set = mlx4_mac_addr_set, + .set_mc_addr_list = mlx4_set_mc_addr_list, + .stats_get = mlx4_stats_get, + .stats_reset = mlx4_stats_reset, + .fw_version_get = mlx4_fw_version_get, + .dev_infos_get = mlx4_dev_infos_get, + .dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get, + .vlan_filter_set = mlx4_vlan_filter_set, + .rx_queue_setup = mlx4_rx_queue_setup, + .tx_queue_setup = mlx4_tx_queue_setup, + .rx_queue_release = mlx4_rx_queue_release, + .tx_queue_release = mlx4_tx_queue_release, + .flow_ctrl_get = mlx4_flow_ctrl_get, + .flow_ctrl_set = mlx4_flow_ctrl_set, + .mtu_set = mlx4_mtu_set, + .filter_ctrl = mlx4_filter_ctrl, + .rx_queue_intr_enable = mlx4_rx_intr_enable, + .rx_queue_intr_disable = mlx4_rx_intr_disable, + .is_removed = mlx4_is_removed, +}; + +/* Available operations from secondary process. */ +static const struct eth_dev_ops mlx4_dev_sec_ops = { + .stats_get = mlx4_stats_get, + .stats_reset = mlx4_stats_reset, + .fw_version_get = mlx4_fw_version_get, + .dev_infos_get = mlx4_dev_infos_get, +}; + +/** + * Get PCI information from struct ibv_device. + * + * @param device + * Pointer to Ethernet device structure. + * @param[out] pci_addr + * PCI bus address output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_ibv_device_to_pci_addr(const struct ibv_device *device, + struct rte_pci_addr *pci_addr) +{ + FILE *file; + char line[32]; + MKSTR(path, "%s/device/uevent", device->ibdev_path); + + file = fopen(path, "rb"); + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } + while (fgets(line, sizeof(line), file) == line) { + size_t len = strlen(line); + int ret; + + /* Truncate long lines. */ + if (len == (sizeof(line) - 1)) + while (line[(len - 1)] != '\n') { + ret = fgetc(file); + if (ret == EOF) + break; + line[(len - 1)] = ret; + } + /* Extract information. */ + if (sscanf(line, + "PCI_SLOT_NAME=" + "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", + &pci_addr->domain, + &pci_addr->bus, + &pci_addr->devid, + &pci_addr->function) == 4) { + ret = 0; + break; + } + } + fclose(file); + return 0; +} + +/** + * Verify and store value for device argument. + * + * @param[in] key + * Key argument to verify. + * @param[in] val + * Value associated with key. + * @param[in, out] conf + * Shared configuration data. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_arg_parse(const char *key, const char *val, struct mlx4_conf *conf) +{ + unsigned long tmp; + + errno = 0; + tmp = strtoul(val, NULL, 0); + if (errno) { + rte_errno = errno; + WARN("%s: \"%s\" is not a valid integer", key, val); + return -rte_errno; + } + if (strcmp(MLX4_PMD_PORT_KVARG, key) == 0) { + uint32_t ports = rte_log2_u32(conf->ports.present + 1); + + if (tmp >= ports) { + ERROR("port index %lu outside range [0,%" PRIu32 ")", + tmp, ports); + return -EINVAL; + } + if (!(conf->ports.present & (1 << tmp))) { + rte_errno = EINVAL; + ERROR("invalid port index %lu", tmp); + return -rte_errno; + } + conf->ports.enabled |= 1 << tmp; + } else if (strcmp(MLX4_MR_EXT_MEMSEG_EN_KVARG, key) == 0) { + conf->mr_ext_memseg_en = !!tmp; + } else { + rte_errno = EINVAL; + WARN("%s: unknown parameter", key); + return -rte_errno; + } + return 0; +} + +/** + * Parse device parameters. + * + * @param devargs + * Device arguments structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf) +{ + struct rte_kvargs *kvlist; + unsigned int arg_count; + int ret = 0; + int i; + + if (devargs == NULL) + return 0; + kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params); + if (kvlist == NULL) { + rte_errno = EINVAL; + ERROR("failed to parse kvargs"); + return -rte_errno; + } + /* Process parameters. */ + for (i = 0; pmd_mlx4_init_params[i]; ++i) { + arg_count = rte_kvargs_count(kvlist, pmd_mlx4_init_params[i]); + while (arg_count-- > 0) { + ret = rte_kvargs_process(kvlist, + pmd_mlx4_init_params[i], + (int (*)(const char *, + const char *, + void *)) + mlx4_arg_parse, + conf); + if (ret != 0) + goto free_kvlist; + } + } +free_kvlist: + rte_kvargs_free(kvlist); + return ret; +} + +/** + * Interpret RSS capabilities reported by device. + * + * This function returns the set of usable Verbs RSS hash fields, kernel + * quirks taken into account. + * + * @param ctx + * Verbs context. + * @param pd + * Verbs protection domain. + * @param device_attr_ex + * Extended device attributes to interpret. + * + * @return + * Usable RSS hash fields mask in Verbs format. + */ +static uint64_t +mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd, + struct ibv_device_attr_ex *device_attr_ex) +{ + uint64_t hw_rss_sup = device_attr_ex->rss_caps.rx_hash_fields_mask; + struct ibv_cq *cq = NULL; + struct ibv_wq *wq = NULL; + struct ibv_rwq_ind_table *ind = NULL; + struct ibv_qp *qp = NULL; + + if (!hw_rss_sup) { + WARN("no RSS capabilities reported; disabling support for UDP" + " RSS and inner VXLAN RSS"); + return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 | + IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 | + IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP; + } + if (!(hw_rss_sup & IBV_RX_HASH_INNER)) + return hw_rss_sup; + /* + * Although reported as supported, missing code in some Linux + * versions (v4.15, v4.16) prevents the creation of hash QPs with + * inner capability. + * + * There is no choice but to attempt to instantiate a temporary RSS + * context in order to confirm its support. + */ + cq = mlx4_glue->create_cq(ctx, 1, NULL, NULL, 0); + wq = cq ? mlx4_glue->create_wq + (ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = 1, + .max_sge = 1, + .pd = pd, + .cq = cq, + }) : NULL; + ind = wq ? mlx4_glue->create_rwq_ind_table + (ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = 0, + .ind_tbl = &wq, + .comp_mask = 0, + }) : NULL; + qp = ind ? mlx4_glue->create_qp_ex + (ctx, + &(struct ibv_qp_init_attr_ex){ + .comp_mask = + (IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_IND_TABLE), + .qp_type = IBV_QPT_RAW_PACKET, + .pd = pd, + .rwq_ind_tbl = ind, + .rx_hash_conf = { + .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE, + .rx_hash_key = mlx4_rss_hash_key_default, + .rx_hash_fields_mask = hw_rss_sup, + }, + }) : NULL; + if (!qp) { + WARN("disabling unusable inner RSS capability due to kernel" + " quirk"); + hw_rss_sup &= ~IBV_RX_HASH_INNER; + } else { + claim_zero(mlx4_glue->destroy_qp(qp)); + } + if (ind) + claim_zero(mlx4_glue->destroy_rwq_ind_table(ind)); + if (wq) + claim_zero(mlx4_glue->destroy_wq(wq)); + if (cq) + claim_zero(mlx4_glue->destroy_cq(cq)); + return hw_rss_sup; +} + +static struct rte_pci_driver mlx4_driver; + +/** + * PMD global initialization. + * + * Independent from individual device, this function initializes global + * per-PMD data structures distinguishing primary and secondary processes. + * Hence, each initialization is called once per a process. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_init_once(void) +{ + struct mlx4_shared_data *sd; + struct mlx4_local_data *ld = &mlx4_local_data; + + if (mlx4_init_shared_data()) + return -rte_errno; + sd = mlx4_shared_data; + assert(sd); + rte_spinlock_lock(&sd->lock); + switch (rte_eal_process_type()) { + case RTE_PROC_PRIMARY: + if (sd->init_done) + break; + LIST_INIT(&sd->mem_event_cb_list); + rte_rwlock_init(&sd->mem_event_rwlock); + rte_mem_event_callback_register("MLX4_MEM_EVENT_CB", + mlx4_mr_mem_event_cb, NULL); + mlx4_mp_init_primary(); + sd->init_done = true; + break; + case RTE_PROC_SECONDARY: + if (ld->init_done) + break; + mlx4_mp_init_secondary(); + ++sd->secondary_cnt; + ld->init_done = true; + break; + default: + break; + } + rte_spinlock_unlock(&sd->lock); + return 0; +} + +/** + * DPDK callback to register a PCI device. + * + * This function creates an Ethernet device for each port of a given + * PCI device. + * + * @param[in] pci_drv + * PCI driver structure (mlx4_driver). + * @param[in] pci_dev + * PCI device information. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) +{ + struct ibv_device **list; + struct ibv_device *ibv_dev; + int err = 0; + struct ibv_context *attr_ctx = NULL; + struct ibv_device_attr device_attr; + struct ibv_device_attr_ex device_attr_ex; + struct mlx4_conf conf = { + .ports.present = 0, + .mr_ext_memseg_en = 1, + }; + unsigned int vf; + int i; + + (void)pci_drv; + err = mlx4_init_once(); + if (err) { + ERROR("unable to init PMD global data: %s", + strerror(rte_errno)); + return -rte_errno; + } + assert(pci_drv == &mlx4_driver); + list = mlx4_glue->get_device_list(&i); + if (list == NULL) { + rte_errno = errno; + assert(rte_errno); + if (rte_errno == ENOSYS) + ERROR("cannot list devices, is ib_uverbs loaded?"); + return -rte_errno; + } + assert(i >= 0); + /* + * For each listed device, check related sysfs entry against + * the provided PCI ID. + */ + while (i != 0) { + struct rte_pci_addr pci_addr; + + --i; + DEBUG("checking device \"%s\"", list[i]->name); + if (mlx4_ibv_device_to_pci_addr(list[i], &pci_addr)) + continue; + if ((pci_dev->addr.domain != pci_addr.domain) || + (pci_dev->addr.bus != pci_addr.bus) || + (pci_dev->addr.devid != pci_addr.devid) || + (pci_dev->addr.function != pci_addr.function)) + continue; + vf = (pci_dev->id.device_id == + PCI_DEVICE_ID_MELLANOX_CONNECTX3VF); + INFO("PCI information matches, using device \"%s\" (VF: %s)", + list[i]->name, (vf ? "true" : "false")); + attr_ctx = mlx4_glue->open_device(list[i]); + err = errno; + break; + } + if (attr_ctx == NULL) { + mlx4_glue->free_device_list(list); + switch (err) { + case 0: + rte_errno = ENODEV; + ERROR("cannot access device, is mlx4_ib loaded?"); + return -rte_errno; + case EINVAL: + rte_errno = EINVAL; + ERROR("cannot use device, are drivers up to date?"); + return -rte_errno; + } + assert(err > 0); + rte_errno = err; + return -rte_errno; + } + ibv_dev = list[i]; + DEBUG("device opened"); + if (mlx4_glue->query_device(attr_ctx, &device_attr)) { + err = ENODEV; + goto error; + } + INFO("%u port(s) detected", device_attr.phys_port_cnt); + conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1; + if (mlx4_args(pci_dev->device.devargs, &conf)) { + ERROR("failed to process device arguments"); + err = EINVAL; + goto error; + } + /* Use all ports when none are defined */ + if (!conf.ports.enabled) + conf.ports.enabled = conf.ports.present; + /* Retrieve extended device attributes. */ + if (mlx4_glue->query_device_ex(attr_ctx, NULL, &device_attr_ex)) { + err = ENODEV; + goto error; + } + assert(device_attr.max_sge >= MLX4_MAX_SGE); + for (i = 0; i < device_attr.phys_port_cnt; i++) { + uint32_t port = i + 1; /* ports are indexed from one */ + struct ibv_context *ctx = NULL; + struct ibv_port_attr port_attr; + struct ibv_pd *pd = NULL; + struct mlx4_priv *priv = NULL; + struct rte_eth_dev *eth_dev = NULL; + struct ether_addr mac; + char name[RTE_ETH_NAME_MAX_LEN]; + + /* If port is not enabled, skip. */ + if (!(conf.ports.enabled & (1 << i))) + continue; + DEBUG("using port %u", port); + ctx = mlx4_glue->open_device(ibv_dev); + if (ctx == NULL) { + err = ENODEV; + goto port_error; + } + snprintf(name, sizeof(name), "%s port %u", + mlx4_glue->get_device_name(ibv_dev), port); + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + eth_dev = rte_eth_dev_attach_secondary(name); + if (eth_dev == NULL) { + ERROR("can not attach rte ethdev"); + rte_errno = ENOMEM; + err = rte_errno; + goto error; + } + priv = eth_dev->data->dev_private; + if (!priv->verbs_alloc_ctx.enabled) { + ERROR("secondary process is not supported" + " due to lack of external allocator" + " from Verbs"); + rte_errno = ENOTSUP; + err = rte_errno; + goto error; + } + eth_dev->device = &pci_dev->device; + eth_dev->dev_ops = &mlx4_dev_sec_ops; + err = mlx4_proc_priv_init(eth_dev); + if (err) + goto error; + /* Receive command fd from primary process. */ + err = mlx4_mp_req_verbs_cmd_fd(eth_dev); + if (err < 0) { + err = rte_errno; + goto error; + } + /* Remap UAR for Tx queues. */ + err = mlx4_tx_uar_init_secondary(eth_dev, err); + if (err) { + err = rte_errno; + goto error; + } + /* + * Ethdev pointer is still required as input since + * the primary device is not accessible from the + * secondary process. + */ + eth_dev->tx_pkt_burst = mlx4_tx_burst; + eth_dev->rx_pkt_burst = mlx4_rx_burst; + claim_zero(mlx4_glue->close_device(ctx)); + rte_eth_copy_pci_info(eth_dev, pci_dev); + rte_eth_dev_probing_finish(eth_dev); + continue; + } + /* Check port status. */ + err = mlx4_glue->query_port(ctx, port, &port_attr); + if (err) { + err = ENODEV; + ERROR("port query failed: %s", strerror(err)); + goto port_error; + } + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + err = ENOTSUP; + ERROR("port %d is not configured in Ethernet mode", + port); + goto port_error; + } + if (port_attr.state != IBV_PORT_ACTIVE) + DEBUG("port %d is not active: \"%s\" (%d)", + port, mlx4_glue->port_state_str(port_attr.state), + port_attr.state); + /* Make asynchronous FD non-blocking to handle interrupts. */ + err = mlx4_fd_set_non_blocking(ctx->async_fd); + if (err) { + ERROR("cannot make asynchronous FD non-blocking: %s", + strerror(err)); + goto port_error; + } + /* Allocate protection domain. */ + pd = mlx4_glue->alloc_pd(ctx); + if (pd == NULL) { + err = ENOMEM; + ERROR("PD allocation failure"); + goto port_error; + } + /* from rte_ethdev.c */ + priv = rte_zmalloc("ethdev private structure", + sizeof(*priv), + RTE_CACHE_LINE_SIZE); + if (priv == NULL) { + err = ENOMEM; + ERROR("priv allocation failure"); + goto port_error; + } + priv->ctx = ctx; + priv->device_attr = device_attr; + priv->port = port; + priv->pd = pd; + priv->mtu = ETHER_MTU; + priv->vf = vf; + priv->hw_csum = !!(device_attr.device_cap_flags & + IBV_DEVICE_RAW_IP_CSUM); + DEBUG("checksum offloading is %ssupported", + (priv->hw_csum ? "" : "not ")); + /* Only ConnectX-3 Pro supports tunneling. */ + priv->hw_csum_l2tun = + priv->hw_csum && + (device_attr.vendor_part_id == + PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO); + DEBUG("L2 tunnel checksum offloads are %ssupported", + priv->hw_csum_l2tun ? "" : "not "); + priv->hw_rss_sup = mlx4_hw_rss_sup(priv->ctx, priv->pd, + &device_attr_ex); + DEBUG("supported RSS hash fields mask: %016" PRIx64, + priv->hw_rss_sup); + priv->hw_rss_max_qps = + device_attr_ex.rss_caps.max_rwq_indirection_table_size; + DEBUG("MAX RSS queues %d", priv->hw_rss_max_qps); + priv->hw_fcs_strip = !!(device_attr_ex.raw_packet_caps & + IBV_RAW_PACKET_CAP_SCATTER_FCS); + DEBUG("FCS stripping toggling is %ssupported", + priv->hw_fcs_strip ? "" : "not "); + priv->tso = + ((device_attr_ex.tso_caps.max_tso > 0) && + (device_attr_ex.tso_caps.supported_qpts & + (1 << IBV_QPT_RAW_PACKET))); + if (priv->tso) + priv->tso_max_payload_sz = + device_attr_ex.tso_caps.max_tso; + DEBUG("TSO is %ssupported", + priv->tso ? "" : "not "); + priv->mr_ext_memseg_en = conf.mr_ext_memseg_en; + /* Configure the first MAC address by default. */ + err = mlx4_get_mac(priv, &mac.addr_bytes); + if (err) { + ERROR("cannot get MAC address, is mlx4_en loaded?" + " (error: %s)", strerror(err)); + goto port_error; + } + INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", + priv->port, + mac.addr_bytes[0], mac.addr_bytes[1], + mac.addr_bytes[2], mac.addr_bytes[3], + mac.addr_bytes[4], mac.addr_bytes[5]); + /* Register MAC address. */ + priv->mac[0] = mac; +#ifndef NDEBUG + { + char ifname[IF_NAMESIZE]; + + if (mlx4_get_ifname(priv, &ifname) == 0) + DEBUG("port %u ifname is \"%s\"", + priv->port, ifname); + else + DEBUG("port %u ifname is unknown", priv->port); + } +#endif + /* Get actual MTU if possible. */ + mlx4_mtu_get(priv, &priv->mtu); + DEBUG("port %u MTU is %u", priv->port, priv->mtu); + eth_dev = rte_eth_dev_allocate(name); + if (eth_dev == NULL) { + err = ENOMEM; + ERROR("can not allocate rte ethdev"); + goto port_error; + } + eth_dev->data->dev_private = priv; + eth_dev->data->mac_addrs = priv->mac; + eth_dev->device = &pci_dev->device; + rte_eth_copy_pci_info(eth_dev, pci_dev); + /* Initialize local interrupt handle for current port. */ + priv->intr_handle = (struct rte_intr_handle){ + .fd = -1, + .type = RTE_INTR_HANDLE_EXT, + }; + /* + * Override ethdev interrupt handle pointer with private + * handle instead of that of the parent PCI device used by + * default. This prevents it from being shared between all + * ports of the same PCI device since each of them is + * associated its own Verbs context. + * + * Rx interrupts in particular require this as the PMD has + * no control over the registration of queue interrupts + * besides setting up eth_dev->intr_handle, the rest is + * handled by rte_intr_rx_ctl(). + */ + eth_dev->intr_handle = &priv->intr_handle; + priv->dev_data = eth_dev->data; + eth_dev->dev_ops = &mlx4_dev_ops; +#ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS + /* Hint libmlx4 to use PMD allocator for data plane resources */ + struct mlx4dv_ctx_allocators alctr = { + .alloc = &mlx4_alloc_verbs_buf, + .free = &mlx4_free_verbs_buf, + .data = priv, + }; + err = mlx4_glue->dv_set_context_attr + (ctx, MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS, + (void *)((uintptr_t)&alctr)); + if (err) + WARN("Verbs external allocator is not supported"); + else + priv->verbs_alloc_ctx.enabled = 1; +#endif + /* Bring Ethernet device up. */ + DEBUG("forcing Ethernet interface up"); + mlx4_dev_set_link_up(eth_dev); + /* Update link status once if waiting for LSC. */ + if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + mlx4_link_update(eth_dev, 0); + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + */ + err = mlx4_mr_btree_init(&priv->mr.cache, + MLX4_MR_BTREE_CACHE_N * 2, + eth_dev->device->numa_node); + if (err) { + /* rte_errno is already set. */ + goto port_error; + } + /* Add device to memory callback list. */ + rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock); + LIST_INSERT_HEAD(&mlx4_shared_data->mem_event_cb_list, + priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock); + rte_eth_dev_probing_finish(eth_dev); + continue; +port_error: + rte_free(priv); + if (eth_dev != NULL) + eth_dev->data->dev_private = NULL; + if (pd) + claim_zero(mlx4_glue->dealloc_pd(pd)); + if (ctx) + claim_zero(mlx4_glue->close_device(ctx)); + if (eth_dev != NULL) { + /* mac_addrs must not be freed because part of dev_private */ + eth_dev->data->mac_addrs = NULL; + rte_eth_dev_release_port(eth_dev); + } + break; + } + /* + * XXX if something went wrong in the loop above, there is a resource + * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as + * long as the dpdk does not provide a way to deallocate a ethdev and a + * way to enumerate the registered ethdevs to free the previous ones. + */ +error: + if (attr_ctx) + claim_zero(mlx4_glue->close_device(attr_ctx)); + if (list) + mlx4_glue->free_device_list(list); + if (err) + rte_errno = err; + return -err; +} + +static const struct rte_pci_id mlx4_pci_id_map[] = { + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX3) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX3VF) + }, + { + .vendor_id = 0 + } +}; + +static struct rte_pci_driver mlx4_driver = { + .driver = { + .name = MLX4_DRIVER_NAME + }, + .id_table = mlx4_pci_id_map, + .probe = mlx4_pci_probe, + .drv_flags = RTE_PCI_DRV_INTR_LSC | + RTE_PCI_DRV_INTR_RMV, +}; + +#ifdef RTE_IBVERBS_LINK_DLOPEN + +/** + * Suffix RTE_EAL_PMD_PATH with "-glue". + * + * This function performs a sanity check on RTE_EAL_PMD_PATH before + * suffixing its last component. + * + * @param buf[out] + * Output buffer, should be large enough otherwise NULL is returned. + * @param size + * Size of @p out. + * + * @return + * Pointer to @p buf or @p NULL in case suffix cannot be appended. + */ +static char * +mlx4_glue_path(char *buf, size_t size) +{ + static const char *const bad[] = { "/", ".", "..", NULL }; + const char *path = RTE_EAL_PMD_PATH; + size_t len = strlen(path); + size_t off; + int i; + + while (len && path[len - 1] == '/') + --len; + for (off = len; off && path[off - 1] != '/'; --off) + ; + for (i = 0; bad[i]; ++i) + if (!strncmp(path + off, bad[i], (int)(len - off))) + goto error; + i = snprintf(buf, size, "%.*s-glue", (int)len, path); + if (i == -1 || (size_t)i >= size) + goto error; + return buf; +error: + ERROR("unable to append \"-glue\" to last component of" + " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\")," + " please re-configure DPDK"); + return NULL; +} + +/** + * Initialization routine for run-time dependency on rdma-core. + */ +static int +mlx4_glue_init(void) +{ + char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; + const char *path[] = { + /* + * A basic security check is necessary before trusting + * MLX4_GLUE_PATH, which may override RTE_EAL_PMD_PATH. + */ + (geteuid() == getuid() && getegid() == getgid() ? + getenv("MLX4_GLUE_PATH") : NULL), + /* + * When RTE_EAL_PMD_PATH is set, use its glue-suffixed + * variant, otherwise let dlopen() look up libraries on its + * own. + */ + (*RTE_EAL_PMD_PATH ? + mlx4_glue_path(glue_path, sizeof(glue_path)) : ""), + }; + unsigned int i = 0; + void *handle = NULL; + void **sym; + const char *dlmsg; + + while (!handle && i != RTE_DIM(path)) { + const char *end; + size_t len; + int ret; + + if (!path[i]) { + ++i; + continue; + } + end = strpbrk(path[i], ":;"); + if (!end) + end = path[i] + strlen(path[i]); + len = end - path[i]; + ret = 0; + do { + char name[ret + 1]; + + ret = snprintf(name, sizeof(name), "%.*s%s" MLX4_GLUE, + (int)len, path[i], + (!len || *(end - 1) == '/') ? "" : "/"); + if (ret == -1) + break; + if (sizeof(name) != (size_t)ret + 1) + continue; + DEBUG("looking for rdma-core glue as \"%s\"", name); + handle = dlopen(name, RTLD_LAZY); + break; + } while (1); + path[i] = end + 1; + if (!*end) + ++i; + } + if (!handle) { + rte_errno = EINVAL; + dlmsg = dlerror(); + if (dlmsg) + WARN("cannot load glue library: %s", dlmsg); + goto glue_error; + } + sym = dlsym(handle, "mlx4_glue"); + if (!sym || !*sym) { + rte_errno = EINVAL; + dlmsg = dlerror(); + if (dlmsg) + ERROR("cannot resolve glue symbol: %s", dlmsg); + goto glue_error; + } + mlx4_glue = *sym; + return 0; +glue_error: + if (handle) + dlclose(handle); + WARN("cannot initialize PMD due to missing run-time" + " dependency on rdma-core libraries (libibverbs," + " libmlx4)"); + return -rte_errno; +} + +#endif + +/** + * Driver initialization routine. + */ +RTE_INIT(rte_mlx4_pmd_init) +{ + /* + * MLX4_DEVICE_FATAL_CLEANUP tells ibv_destroy functions we + * want to get success errno value in case of calling them + * when the device was removed. + */ + setenv("MLX4_DEVICE_FATAL_CLEANUP", "1", 1); + /* + * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use + * huge pages. Calling ibv_fork_init() during init allows + * applications to use fork() safely for purposes other than + * using this PMD, which is not supported in forked processes. + */ + setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); +#ifdef RTE_IBVERBS_LINK_DLOPEN + if (mlx4_glue_init()) + return; + assert(mlx4_glue); +#endif +#ifndef NDEBUG + /* Glue structure must not contain any NULL pointers. */ + { + unsigned int i; + + for (i = 0; i != sizeof(*mlx4_glue) / sizeof(void *); ++i) + assert(((const void *const *)mlx4_glue)[i]); + } +#endif + if (strcmp(mlx4_glue->version, MLX4_GLUE_VERSION)) { + ERROR("rdma-core glue \"%s\" mismatch: \"%s\" is required", + mlx4_glue->version, MLX4_GLUE_VERSION); + return; + } + mlx4_glue->fork_init(); + rte_pci_register(&mlx4_driver); +} + +RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__); +RTE_PMD_REGISTER_PCI_TABLE(net_mlx4, mlx4_pci_id_map); +RTE_PMD_REGISTER_KMOD_DEP(net_mlx4, + "* ib_uverbs & mlx4_en & mlx4_core & mlx4_ib"); diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4.h b/src/seastar/dpdk/drivers/net/mlx4/mlx4.h new file mode 100644 index 000000000..e2d184f84 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2012 6WIND S.A. + * Copyright 2012 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX4_H_ +#define RTE_PMD_MLX4_H_ + +#include <net/if.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev_driver.h> +#include <rte_ether.h> +#include <rte_interrupts.h> +#include <rte_mempool.h> +#include <rte_rwlock.h> + +#include "mlx4_mr.h" + +#ifndef IBV_RX_HASH_INNER +/** This is not necessarily defined by supported RDMA core versions. */ +#define IBV_RX_HASH_INNER (1ull << 31) +#endif /* IBV_RX_HASH_INNER */ + +/** Maximum number of simultaneous MAC addresses. This value is arbitrary. */ +#define MLX4_MAX_MAC_ADDRESSES 128 + +/** Request send completion once in every 64 sends, might be less. */ +#define MLX4_PMD_TX_PER_COMP_REQ 64 + +/** Maximum size for inline data. */ +#define MLX4_PMD_MAX_INLINE 0 + +/** Fixed RSS hash key size in bytes. Cannot be modified. */ +#define MLX4_RSS_HASH_KEY_SIZE 40 + +/** Interrupt alarm timeout value in microseconds. */ +#define MLX4_INTR_ALARM_TIMEOUT 100000 + +/* Maximum packet headers size (L2+L3+L4) for TSO. */ +#define MLX4_MAX_TSO_HEADER 192 + +/** Port parameter. */ +#define MLX4_PMD_PORT_KVARG "port" + +/** Enable extending memsegs when creating a MR. */ +#define MLX4_MR_EXT_MEMSEG_EN_KVARG "mr_ext_memseg_en" + +enum { + PCI_VENDOR_ID_MELLANOX = 0x15b3, +}; + +enum { + PCI_DEVICE_ID_MELLANOX_CONNECTX3 = 0x1003, + PCI_DEVICE_ID_MELLANOX_CONNECTX3VF = 0x1004, + PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007, +}; + +/* Request types for IPC. */ +enum mlx4_mp_req_type { + MLX4_MP_REQ_VERBS_CMD_FD = 1, + MLX4_MP_REQ_CREATE_MR, + MLX4_MP_REQ_START_RXTX, + MLX4_MP_REQ_STOP_RXTX, +}; + +/* Pameters for IPC. */ +struct mlx4_mp_param { + enum mlx4_mp_req_type type; + int port_id; + int result; + RTE_STD_C11 + union { + uintptr_t addr; /* MLX4_MP_REQ_CREATE_MR */ + } args; +}; + +/** Request timeout for IPC. */ +#define MLX4_MP_REQ_TIMEOUT_SEC 5 + +/** Key string for IPC. */ +#define MLX4_MP_NAME "net_mlx4_mp" + +/** Driver name reported to lower layers and used in log output. */ +#define MLX4_DRIVER_NAME "net_mlx4" + +struct mlx4_drop; +struct mlx4_rss; +struct rxq; +struct txq; +struct rte_flow; + +/** + * Type of objet being allocated. + */ +enum mlx4_verbs_alloc_type { + MLX4_VERBS_ALLOC_TYPE_NONE, + MLX4_VERBS_ALLOC_TYPE_TX_QUEUE, + MLX4_VERBS_ALLOC_TYPE_RX_QUEUE, +}; + +/** + * Verbs allocator needs a context to know in the callback which kind of + * resources it is allocating. + */ +struct mlx4_verbs_alloc_ctx { + int enabled; + enum mlx4_verbs_alloc_type type; /* Kind of object being allocated. */ + const void *obj; /* Pointer to the DPDK object. */ +}; + +LIST_HEAD(mlx4_dev_list, mlx4_priv); +LIST_HEAD(mlx4_mr_list, mlx4_mr); + +/* Shared data between primary and secondary processes. */ +struct mlx4_shared_data { + rte_spinlock_t lock; + /* Global spinlock for primary and secondary processes. */ + int init_done; /* Whether primary has done initialization. */ + unsigned int secondary_cnt; /* Number of secondary processes init'd. */ + struct mlx4_dev_list mem_event_cb_list; + rte_rwlock_t mem_event_rwlock; +}; + +/* Per-process data structure, not visible to other processes. */ +struct mlx4_local_data { + int init_done; /* Whether a secondary has done initialization. */ +}; + +extern struct mlx4_shared_data *mlx4_shared_data; + +/* Per-process private structure. */ +struct mlx4_proc_priv { + size_t uar_table_sz; + /* Size of UAR register table. */ + void *uar_table[]; + /* Table of UAR registers for each process. */ +}; + +#define MLX4_PROC_PRIV(port_id) \ + ((struct mlx4_proc_priv *)rte_eth_devices[port_id].process_private) + +/** Private data structure. */ +struct mlx4_priv { + LIST_ENTRY(mlx4_priv) mem_event_cb; + /**< Called by memory event callback. */ + struct rte_eth_dev_data *dev_data; /* Pointer to device data. */ + struct ibv_context *ctx; /**< Verbs context. */ + struct ibv_device_attr device_attr; /**< Device properties. */ + struct ibv_pd *pd; /**< Protection Domain. */ + /* Device properties. */ + uint16_t mtu; /**< Configured MTU. */ + uint8_t port; /**< Physical port number. */ + uint32_t started:1; /**< Device started, flows enabled. */ + uint32_t vf:1; /**< This is a VF device. */ + uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */ + uint32_t isolated:1; /**< Toggle isolated mode. */ + uint32_t rss_init:1; /**< Common RSS context is initialized. */ + uint32_t hw_csum:1; /**< Checksum offload is supported. */ + uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */ + uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */ + uint32_t tso:1; /**< Transmit segmentation offload is supported. */ + uint32_t mr_ext_memseg_en:1; + /** Whether memseg should be extended for MR creation. */ + uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */ + uint32_t hw_rss_max_qps; /**< Max Rx Queues supported by RSS. */ + uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */ + struct rte_intr_handle intr_handle; /**< Port interrupt handle. */ + struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */ + struct { + uint32_t dev_gen; /* Generation number to flush local caches. */ + rte_rwlock_t rwlock; /* MR Lock. */ + struct mlx4_mr_btree cache; /* Global MR cache table. */ + struct mlx4_mr_list mr_list; /* Registered MR list. */ + struct mlx4_mr_list mr_free_list; /* Freed MR list. */ + } mr; + LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */ + LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */ + struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES]; + /**< Configured MAC addresses. Unused entries are zeroed. */ + uint32_t mac_mc; /**< Number of trailing multicast entries in mac[]. */ + struct mlx4_verbs_alloc_ctx verbs_alloc_ctx; + /**< Context for Verbs allocator. */ +}; + +#define PORT_ID(priv) ((priv)->dev_data->port_id) +#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)]) + +/* mlx4_ethdev.c */ + +int mlx4_get_ifname(const struct mlx4_priv *priv, char (*ifname)[IF_NAMESIZE]); +int mlx4_get_mac(struct mlx4_priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]); +int mlx4_mtu_get(struct mlx4_priv *priv, uint16_t *mtu); +int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu); +int mlx4_dev_set_link_down(struct rte_eth_dev *dev); +int mlx4_dev_set_link_up(struct rte_eth_dev *dev); +void mlx4_promiscuous_enable(struct rte_eth_dev *dev); +void mlx4_promiscuous_disable(struct rte_eth_dev *dev); +void mlx4_allmulticast_enable(struct rte_eth_dev *dev); +void mlx4_allmulticast_disable(struct rte_eth_dev *dev); +void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); +int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, + uint32_t index, uint32_t vmdq); +int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr); +int mlx4_set_mc_addr_list(struct rte_eth_dev *dev, struct ether_addr *list, + uint32_t num); +int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on); +int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats); +void mlx4_stats_reset(struct rte_eth_dev *dev); +int mlx4_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size); +void mlx4_dev_infos_get(struct rte_eth_dev *dev, + struct rte_eth_dev_info *info); +int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete); +int mlx4_flow_ctrl_get(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx4_flow_ctrl_set(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +const uint32_t *mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev); +int mlx4_is_removed(struct rte_eth_dev *dev); + +/* mlx4_intr.c */ + +int mlx4_intr_uninstall(struct mlx4_priv *priv); +int mlx4_intr_install(struct mlx4_priv *priv); +int mlx4_rxq_intr_enable(struct mlx4_priv *priv); +void mlx4_rxq_intr_disable(struct mlx4_priv *priv); +int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx); +int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx); + +/* mlx4_mp.c */ +void mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev); +void mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev); +int mlx4_mp_req_mr_create(struct rte_eth_dev *dev, uintptr_t addr); +int mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev); +void mlx4_mp_init_primary(void); +void mlx4_mp_uninit_primary(void); +void mlx4_mp_init_secondary(void); +void mlx4_mp_uninit_secondary(void); + +#endif /* RTE_PMD_MLX4_H_ */ diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_ethdev.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_ethdev.c new file mode 100644 index 000000000..765affc34 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_ethdev.c @@ -0,0 +1,968 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Miscellaneous control operations for mlx4 driver. + */ + +#include <assert.h> +#include <dirent.h> +#include <errno.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <unistd.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_bus_pci.h> +#include <rte_errno.h> +#include <rte_ethdev_driver.h> +#include <rte_ether.h> +#include <rte_flow.h> +#include <rte_pci.h> +#include <rte_string_fns.h> + +#include "mlx4.h" +#include "mlx4_flow.h" +#include "mlx4_glue.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Get interface name from private structure. + * + * @param[in] priv + * Pointer to private structure. + * @param[out] ifname + * Interface name output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_get_ifname(const struct mlx4_priv *priv, char (*ifname)[IF_NAMESIZE]) +{ + DIR *dir; + struct dirent *dent; + unsigned int dev_type = 0; + unsigned int dev_port_prev = ~0u; + char match[IF_NAMESIZE] = ""; + + { + MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); + + dir = opendir(path); + if (dir == NULL) { + rte_errno = errno; + return -rte_errno; + } + } + while ((dent = readdir(dir)) != NULL) { + char *name = dent->d_name; + FILE *file; + unsigned int dev_port; + int r; + + if ((name[0] == '.') && + ((name[1] == '\0') || + ((name[1] == '.') && (name[2] == '\0')))) + continue; + + MKSTR(path, "%s/device/net/%s/%s", + priv->ctx->device->ibdev_path, name, + (dev_type ? "dev_id" : "dev_port")); + + file = fopen(path, "rb"); + if (file == NULL) { + if (errno != ENOENT) + continue; + /* + * Switch to dev_id when dev_port does not exist as + * is the case with Linux kernel versions < 3.15. + */ +try_dev_id: + match[0] = '\0'; + if (dev_type) + break; + dev_type = 1; + dev_port_prev = ~0u; + rewinddir(dir); + continue; + } + r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); + fclose(file); + if (r != 1) + continue; + /* + * Switch to dev_id when dev_port returns the same value for + * all ports. May happen when using a MOFED release older than + * 3.0 with a Linux kernel >= 3.15. + */ + if (dev_port == dev_port_prev) + goto try_dev_id; + dev_port_prev = dev_port; + if (dev_port == (priv->port - 1u)) + strlcpy(match, name, sizeof(match)); + } + closedir(dir); + if (match[0] == '\0') { + rte_errno = ENODEV; + return -rte_errno; + } + strncpy(*ifname, match, sizeof(*ifname)); + return 0; +} + +/** + * Perform ifreq ioctl() on associated Ethernet device. + * + * @param[in] priv + * Pointer to private structure. + * @param req + * Request number to pass to ioctl(). + * @param[out] ifr + * Interface request structure output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_ifreq(const struct mlx4_priv *priv, int req, struct ifreq *ifr) +{ + int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + int ret; + + if (sock == -1) { + rte_errno = errno; + return -rte_errno; + } + ret = mlx4_get_ifname(priv, &ifr->ifr_name); + if (!ret && ioctl(sock, req, ifr) == -1) { + rte_errno = errno; + ret = -rte_errno; + } + close(sock); + return ret; +} + +/** + * Get MAC address by querying netdevice. + * + * @param[in] priv + * Pointer to private structure. + * @param[out] mac + * MAC address output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_get_mac(struct mlx4_priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]) +{ + struct ifreq request; + int ret = mlx4_ifreq(priv, SIOCGIFHWADDR, &request); + + if (ret) + return ret; + memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); + return 0; +} + +/** + * Get device MTU. + * + * @param priv + * Pointer to private structure. + * @param[out] mtu + * MTU value output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mtu_get(struct mlx4_priv *priv, uint16_t *mtu) +{ + struct ifreq request; + int ret = mlx4_ifreq(priv, SIOCGIFMTU, &request); + + if (ret) + return ret; + *mtu = request.ifr_mtu; + return 0; +} + +/** + * DPDK callback to change the MTU. + * + * @param priv + * Pointer to Ethernet device structure. + * @param mtu + * MTU value to set. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct ifreq request = { .ifr_mtu = mtu, }; + int ret = mlx4_ifreq(priv, SIOCSIFMTU, &request); + + if (ret) + return ret; + priv->mtu = mtu; + return 0; +} + +/** + * Set device flags. + * + * @param priv + * Pointer to private structure. + * @param keep + * Bitmask for flags that must remain untouched. + * @param flags + * Bitmask for flags to modify. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_set_flags(struct mlx4_priv *priv, unsigned int keep, unsigned int flags) +{ + struct ifreq request; + int ret = mlx4_ifreq(priv, SIOCGIFFLAGS, &request); + + if (ret) + return ret; + request.ifr_flags &= keep; + request.ifr_flags |= flags & ~keep; + return mlx4_ifreq(priv, SIOCSIFFLAGS, &request); +} + +/** + * Change the link state (UP / DOWN). + * + * @param priv + * Pointer to Ethernet device private data. + * @param up + * Nonzero for link up, otherwise link down. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_dev_set_link(struct mlx4_priv *priv, int up) +{ + int err; + + if (up) { + err = mlx4_set_flags(priv, ~IFF_UP, IFF_UP); + if (err) + return err; + } else { + err = mlx4_set_flags(priv, ~IFF_UP, ~IFF_UP); + if (err) + return err; + } + return 0; +} + +/** + * DPDK callback to bring the link DOWN. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_dev_set_link_down(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + + return mlx4_dev_set_link(priv, 0); +} + +/** + * DPDK callback to bring the link UP. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_dev_set_link_up(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + + return mlx4_dev_set_link(priv, 1); +} + +/** + * Supported Rx mode toggles. + * + * Even and odd values respectively stand for off and on. + */ +enum rxmode_toggle { + RXMODE_TOGGLE_PROMISC_OFF, + RXMODE_TOGGLE_PROMISC_ON, + RXMODE_TOGGLE_ALLMULTI_OFF, + RXMODE_TOGGLE_ALLMULTI_ON, +}; + +/** + * Helper function to toggle promiscuous and all multicast modes. + * + * @param dev + * Pointer to Ethernet device structure. + * @param toggle + * Toggle to set. + */ +static void +mlx4_rxmode_toggle(struct rte_eth_dev *dev, enum rxmode_toggle toggle) +{ + struct mlx4_priv *priv = dev->data->dev_private; + const char *mode; + struct rte_flow_error error; + + switch (toggle) { + case RXMODE_TOGGLE_PROMISC_OFF: + case RXMODE_TOGGLE_PROMISC_ON: + mode = "promiscuous"; + dev->data->promiscuous = toggle & 1; + break; + case RXMODE_TOGGLE_ALLMULTI_OFF: + case RXMODE_TOGGLE_ALLMULTI_ON: + mode = "all multicast"; + dev->data->all_multicast = toggle & 1; + break; + default: + mode = "undefined"; + } + if (!mlx4_flow_sync(priv, &error)) + return; + ERROR("cannot toggle %s mode (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + mode, rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); +} + +/** + * DPDK callback to enable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_promiscuous_enable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_ON); +} + +/** + * DPDK callback to disable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_promiscuous_disable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_OFF); +} + +/** + * DPDK callback to enable all multicast mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_allmulticast_enable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_ON); +} + +/** + * DPDK callback to disable all multicast mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_allmulticast_disable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_OFF); +} + +/** + * DPDK callback to remove a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. + */ +void +mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow_error error; + + if (index >= RTE_DIM(priv->mac) - priv->mac_mc) { + rte_errno = EINVAL; + return; + } + memset(&priv->mac[index], 0, sizeof(priv->mac[index])); + if (!mlx4_flow_sync(priv, &error)) + return; + ERROR("failed to synchronize flow rules after removing MAC address" + " at index %d (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + index, rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); +} + +/** + * DPDK callback to add a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * @param vmdq + * VMDq pool index to associate address with (ignored). + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, + uint32_t index, uint32_t vmdq) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow_error error; + int ret; + + (void)vmdq; + if (index >= RTE_DIM(priv->mac) - priv->mac_mc) { + rte_errno = EINVAL; + return -rte_errno; + } + memcpy(&priv->mac[index], mac_addr, sizeof(priv->mac[index])); + ret = mlx4_flow_sync(priv, &error); + if (!ret) + return 0; + ERROR("failed to synchronize flow rules after adding MAC address" + " at index %d (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + index, rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); + return ret; +} + +/** + * DPDK callback to configure multicast addresses. + * + * @param dev + * Pointer to Ethernet device structure. + * @param list + * List of MAC addresses to register. + * @param num + * Number of entries in list. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_set_mc_addr_list(struct rte_eth_dev *dev, struct ether_addr *list, + uint32_t num) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow_error error; + int ret; + + if (num > RTE_DIM(priv->mac)) { + rte_errno = EINVAL; + return -rte_errno; + } + /* + * Make sure there is enough room to increase the number of + * multicast entries without overwriting standard entries. + */ + if (num > priv->mac_mc) { + unsigned int i; + + for (i = RTE_DIM(priv->mac) - num; + i != RTE_DIM(priv->mac) - priv->mac_mc; + ++i) + if (!is_zero_ether_addr(&priv->mac[i])) { + rte_errno = EBUSY; + return -rte_errno; + } + } else if (num < priv->mac_mc) { + /* Clear unused entries. */ + memset(priv->mac + RTE_DIM(priv->mac) - priv->mac_mc, + 0, + sizeof(priv->mac[0]) * (priv->mac_mc - num)); + } + memcpy(priv->mac + RTE_DIM(priv->mac) - num, list, sizeof(*list) * num); + priv->mac_mc = num; + ret = mlx4_flow_sync(priv, &error); + if (!ret) + return 0; + ERROR("failed to synchronize flow rules after modifying MC list," + " (code %d, \"%s\"), flow error type %d, cause %p, message: %s", + rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); + return ret; +} + +/** + * DPDK callback to configure a VLAN filter. + * + * @param dev + * Pointer to Ethernet device structure. + * @param vlan_id + * VLAN ID to filter. + * @param on + * Toggle filter. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow_error error; + unsigned int vidx = vlan_id / 64; + unsigned int vbit = vlan_id % 64; + uint64_t *v; + int ret; + + if (vidx >= RTE_DIM(dev->data->vlan_filter_conf.ids)) { + rte_errno = EINVAL; + return -rte_errno; + } + v = &dev->data->vlan_filter_conf.ids[vidx]; + *v &= ~(UINT64_C(1) << vbit); + *v |= (uint64_t)!!on << vbit; + ret = mlx4_flow_sync(priv, &error); + if (!ret) + return 0; + ERROR("failed to synchronize flow rules after %s VLAN filter on ID %u" + " (code %d, \"%s\"), " + " flow error type %d, cause %p, message: %s", + on ? "enabling" : "disabling", vlan_id, + rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); + return ret; +} + +/** + * DPDK callback to set the primary MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) +{ + return mlx4_mac_addr_add(dev, mac_addr, 0, 0); +} + +/** + * DPDK callback to get information about the device. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] info + * Info structure output buffer. + */ +void +mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct mlx4_priv *priv = dev->data->dev_private; + unsigned int max; + char ifname[IF_NAMESIZE]; + + /* FIXME: we should ask the device for these values. */ + info->min_rx_bufsize = 32; + info->max_rx_pktlen = 65536; + /* + * Since we need one CQ per QP, the limit is the minimum number + * between the two values. + */ + max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? + priv->device_attr.max_qp : priv->device_attr.max_cq); + /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ + if (max >= 65535) + max = 65535; + info->max_rx_queues = max; + info->max_tx_queues = max; + info->max_mac_addrs = RTE_DIM(priv->mac); + info->tx_offload_capa = mlx4_get_tx_port_offloads(priv); + info->rx_queue_offload_capa = mlx4_get_rx_queue_offloads(priv); + info->rx_offload_capa = (mlx4_get_rx_port_offloads(priv) | + info->rx_queue_offload_capa); + if (mlx4_get_ifname(priv, &ifname) == 0) + info->if_index = if_nametoindex(ifname); + info->hash_key_size = MLX4_RSS_HASH_KEY_SIZE; + info->speed_capa = + ETH_LINK_SPEED_1G | + ETH_LINK_SPEED_10G | + ETH_LINK_SPEED_20G | + ETH_LINK_SPEED_40G | + ETH_LINK_SPEED_56G; + info->flow_type_rss_offloads = mlx4_conv_rss_types(priv, 0, 1); +} + +/** + * Get firmware version of a device. + * + * @param dev + * Ethernet device port. + * @param fw_ver + * String output allocated by caller. + * @param fw_size + * Size of the output string, including terminating null byte. + * + * @return + * 0 on success, or the size of the non truncated string if too big. + */ +int mlx4_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct ibv_device_attr *attr = &priv->device_attr; + size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1; + + if (fw_size < size) + return size; + if (fw_ver != NULL) + strlcpy(fw_ver, attr->fw_ver, fw_size); + return 0; +} + +/** + * DPDK callback to get device statistics. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] stats + * Stats structure output buffer. + */ +int +mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) +{ + struct rte_eth_stats tmp; + unsigned int i; + unsigned int idx; + + memset(&tmp, 0, sizeof(tmp)); + /* Add software counters. */ + for (i = 0; i != dev->data->nb_rx_queues; ++i) { + struct rxq *rxq = dev->data->rx_queues[i]; + + if (rxq == NULL) + continue; + idx = rxq->stats.idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { + tmp.q_ipackets[idx] += rxq->stats.ipackets; + tmp.q_ibytes[idx] += rxq->stats.ibytes; + tmp.q_errors[idx] += (rxq->stats.idropped + + rxq->stats.rx_nombuf); + } + tmp.ipackets += rxq->stats.ipackets; + tmp.ibytes += rxq->stats.ibytes; + tmp.ierrors += rxq->stats.idropped; + tmp.rx_nombuf += rxq->stats.rx_nombuf; + } + for (i = 0; i != dev->data->nb_tx_queues; ++i) { + struct txq *txq = dev->data->tx_queues[i]; + + if (txq == NULL) + continue; + idx = txq->stats.idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { + tmp.q_opackets[idx] += txq->stats.opackets; + tmp.q_obytes[idx] += txq->stats.obytes; + tmp.q_errors[idx] += txq->stats.odropped; + } + tmp.opackets += txq->stats.opackets; + tmp.obytes += txq->stats.obytes; + tmp.oerrors += txq->stats.odropped; + } + *stats = tmp; + return 0; +} + +/** + * DPDK callback to clear device statistics. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_stats_reset(struct rte_eth_dev *dev) +{ + unsigned int i; + + for (i = 0; i != dev->data->nb_rx_queues; ++i) { + struct rxq *rxq = dev->data->rx_queues[i]; + + if (rxq) + rxq->stats = (struct mlx4_rxq_stats){ + .idx = rxq->stats.idx, + }; + } + for (i = 0; i != dev->data->nb_tx_queues; ++i) { + struct txq *txq = dev->data->tx_queues[i]; + + if (txq) + txq->stats = (struct mlx4_txq_stats){ + .idx = txq->stats.idx, + }; + } +} + +/** + * DPDK callback to retrieve physical link information. + * + * @param dev + * Pointer to Ethernet device structure. + * @param wait_to_complete + * Wait for request completion (ignored). + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete) +{ + const struct mlx4_priv *priv = dev->data->dev_private; + struct ethtool_cmd edata = { + .cmd = ETHTOOL_GSET, + }; + struct ifreq ifr; + struct rte_eth_link dev_link; + int link_speed = 0; + + if (priv == NULL) { + rte_errno = EINVAL; + return -rte_errno; + } + (void)wait_to_complete; + if (mlx4_ifreq(priv, SIOCGIFFLAGS, &ifr)) { + WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(rte_errno)); + return -rte_errno; + } + memset(&dev_link, 0, sizeof(dev_link)); + dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)); + ifr.ifr_data = (void *)&edata; + if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) { + WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", + strerror(rte_errno)); + return -rte_errno; + } + link_speed = ethtool_cmd_speed(&edata); + if (link_speed == -1) + dev_link.link_speed = ETH_SPEED_NUM_NONE; + else + dev_link.link_speed = link_speed; + dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? + ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); + dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & + ETH_LINK_SPEED_FIXED); + dev->data->dev_link = dev_link; + return 0; +} + +/** + * DPDK callback to get flow control status. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] fc_conf + * Flow control output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_flow_ctrl_get(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_GPAUSEPARAM, + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) { + ret = rte_errno; + WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" + " failed: %s", + strerror(rte_errno)); + goto out; + } + fc_conf->autoneg = ethpause.autoneg; + if (ethpause.rx_pause && ethpause.tx_pause) + fc_conf->mode = RTE_FC_FULL; + else if (ethpause.rx_pause) + fc_conf->mode = RTE_FC_RX_PAUSE; + else if (ethpause.tx_pause) + fc_conf->mode = RTE_FC_TX_PAUSE; + else + fc_conf->mode = RTE_FC_NONE; + ret = 0; +out: + assert(ret >= 0); + return -ret; +} + +/** + * DPDK callback to modify flow control parameters. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in] fc_conf + * Flow control parameters. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_SPAUSEPARAM, + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + ethpause.autoneg = fc_conf->autoneg; + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_RX_PAUSE)) + ethpause.rx_pause = 1; + else + ethpause.rx_pause = 0; + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_TX_PAUSE)) + ethpause.tx_pause = 1; + else + ethpause.tx_pause = 0; + if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) { + ret = rte_errno; + WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" + " failed: %s", + strerror(rte_errno)); + goto out; + } + ret = 0; +out: + assert(ret >= 0); + return -ret; +} + +/** + * DPDK callback to retrieve the received packet types that are recognized + * by the device. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * Pointer to an array of recognized packet types if in Rx burst mode, + * NULL otherwise. + */ +const uint32_t * +mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev) +{ + static const uint32_t ptypes[] = { + /* refers to rxq_cq_to_pkt_type() */ + RTE_PTYPE_L2_ETHER, + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_L4_FRAG, + RTE_PTYPE_L4_TCP, + RTE_PTYPE_L4_UDP, + RTE_PTYPE_UNKNOWN + }; + static const uint32_t ptypes_l2tun[] = { + /* refers to rxq_cq_to_pkt_type() */ + RTE_PTYPE_L2_ETHER, + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_L4_FRAG, + RTE_PTYPE_L4_TCP, + RTE_PTYPE_L4_UDP, + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_UNKNOWN + }; + struct mlx4_priv *priv = dev->data->dev_private; + + if (dev->rx_pkt_burst == mlx4_rx_burst) { + if (priv->hw_csum_l2tun) + return ptypes_l2tun; + else + return ptypes; + } + return NULL; +} + +/** + * Check if mlx4 device was removed. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 1 when device is removed, otherwise 0. + */ +int +mlx4_is_removed(struct rte_eth_dev *dev) +{ + struct ibv_device_attr device_attr; + struct mlx4_priv *priv = dev->data->dev_private; + + if (mlx4_glue->query_device(priv->ctx, &device_attr) == EIO) + return 1; + return 0; +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_flow.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_flow.c new file mode 100644 index 000000000..1dcdb31a0 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_flow.c @@ -0,0 +1,1625 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Flow API operations for mlx4 driver. + */ + +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <stdalign.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <sys/queue.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_byteorder.h> +#include <rte_errno.h> +#include <rte_ethdev_driver.h> +#include <rte_ether.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> + +/* PMD headers. */ +#include "mlx4.h" +#include "mlx4_glue.h" +#include "mlx4_flow.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** Static initializer for a list of subsequent item types. */ +#define NEXT_ITEM(...) \ + (const enum rte_flow_item_type []){ \ + __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ + } + +/** Processor structure associated with a flow item. */ +struct mlx4_flow_proc_item { + /** Bit-mask for fields supported by this PMD. */ + const void *mask_support; + /** Bit-mask to use when @p item->mask is not provided. */ + const void *mask_default; + /** Size in bytes for @p mask_support and @p mask_default. */ + const unsigned int mask_sz; + /** Merge a pattern item into a flow rule handle. */ + int (*merge)(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error); + /** Size in bytes of the destination structure. */ + const unsigned int dst_sz; + /** List of possible subsequent items. */ + const enum rte_flow_item_type *const next_item; +}; + +/** Shared resources for drop flow rules. */ +struct mlx4_drop { + struct ibv_qp *qp; /**< QP target. */ + struct ibv_cq *cq; /**< CQ associated with above QP. */ + struct mlx4_priv *priv; /**< Back pointer to private data. */ + uint32_t refcnt; /**< Reference count. */ +}; + +/** + * Convert supported RSS hash field types between DPDK and Verbs formats. + * + * This function returns the supported (default) set when @p types has + * special value 0. + * + * @param priv + * Pointer to private structure. + * @param types + * Depending on @p verbs_to_dpdk, hash types in either DPDK (see struct + * rte_eth_rss_conf) or Verbs format. + * @param verbs_to_dpdk + * A zero value converts @p types from DPDK to Verbs, a nonzero value + * performs the reverse operation. + * + * @return + * Converted RSS hash fields on success, (uint64_t)-1 otherwise and + * rte_errno is set. + */ +uint64_t +mlx4_conv_rss_types(struct mlx4_priv *priv, uint64_t types, int verbs_to_dpdk) +{ + enum { + INNER, + IPV4, IPV4_1, IPV4_2, IPV6, IPV6_1, IPV6_2, IPV6_3, + TCP, UDP, + IPV4_TCP, IPV4_UDP, IPV6_TCP, IPV6_TCP_1, IPV6_UDP, IPV6_UDP_1, + }; + enum { + VERBS_IPV4 = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4, + VERBS_IPV6 = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6, + VERBS_TCP = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP, + VERBS_UDP = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP, + }; + static const uint64_t dpdk[] = { + [INNER] = 0, + [IPV4] = ETH_RSS_IPV4, + [IPV4_1] = ETH_RSS_FRAG_IPV4, + [IPV4_2] = ETH_RSS_NONFRAG_IPV4_OTHER, + [IPV6] = ETH_RSS_IPV6, + [IPV6_1] = ETH_RSS_FRAG_IPV6, + [IPV6_2] = ETH_RSS_NONFRAG_IPV6_OTHER, + [IPV6_3] = ETH_RSS_IPV6_EX, + [TCP] = 0, + [UDP] = 0, + [IPV4_TCP] = ETH_RSS_NONFRAG_IPV4_TCP, + [IPV4_UDP] = ETH_RSS_NONFRAG_IPV4_UDP, + [IPV6_TCP] = ETH_RSS_NONFRAG_IPV6_TCP, + [IPV6_TCP_1] = ETH_RSS_IPV6_TCP_EX, + [IPV6_UDP] = ETH_RSS_NONFRAG_IPV6_UDP, + [IPV6_UDP_1] = ETH_RSS_IPV6_UDP_EX, + }; + static const uint64_t verbs[RTE_DIM(dpdk)] = { + [INNER] = IBV_RX_HASH_INNER, + [IPV4] = VERBS_IPV4, + [IPV4_1] = VERBS_IPV4, + [IPV4_2] = VERBS_IPV4, + [IPV6] = VERBS_IPV6, + [IPV6_1] = VERBS_IPV6, + [IPV6_2] = VERBS_IPV6, + [IPV6_3] = VERBS_IPV6, + [TCP] = VERBS_TCP, + [UDP] = VERBS_UDP, + [IPV4_TCP] = VERBS_IPV4 | VERBS_TCP, + [IPV4_UDP] = VERBS_IPV4 | VERBS_UDP, + [IPV6_TCP] = VERBS_IPV6 | VERBS_TCP, + [IPV6_TCP_1] = VERBS_IPV6 | VERBS_TCP, + [IPV6_UDP] = VERBS_IPV6 | VERBS_UDP, + [IPV6_UDP_1] = VERBS_IPV6 | VERBS_UDP, + }; + const uint64_t *in = verbs_to_dpdk ? verbs : dpdk; + const uint64_t *out = verbs_to_dpdk ? dpdk : verbs; + uint64_t seen = 0; + uint64_t conv = 0; + unsigned int i; + + if (!types) { + if (!verbs_to_dpdk) + return priv->hw_rss_sup; + types = priv->hw_rss_sup; + } + for (i = 0; i != RTE_DIM(dpdk); ++i) + if (in[i] && (types & in[i]) == in[i]) { + seen |= types & in[i]; + conv |= out[i]; + } + if ((verbs_to_dpdk || (conv & priv->hw_rss_sup) == conv) && + !(types & ~seen)) + return conv; + rte_errno = ENOTSUP; + return (uint64_t)-1; +} + +/** + * Merge Ethernet pattern item into flow rule handle. + * + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks, except in the specific case of matching + * all multicast traffic (@p spec->dst and @p mask->dst equal to + * 01:00:00:00:00:00). + * - Not providing @p item->spec or providing an empty @p mask->dst is + * *only* supported if the rule doesn't specify additional matching + * criteria (i.e. rule is promiscuous-like). + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_merge_eth(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) +{ + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; + struct ibv_flow_spec_eth *eth; + const char *msg; + unsigned int i; + + if (mask) { + uint32_t sum_dst = 0; + uint32_t sum_src = 0; + + for (i = 0; i != sizeof(mask->dst.addr_bytes); ++i) { + sum_dst += mask->dst.addr_bytes[i]; + sum_src += mask->src.addr_bytes[i]; + } + if (sum_src) { + msg = "mlx4 does not support source MAC matching"; + goto error; + } else if (!sum_dst) { + flow->promisc = 1; + } else if (sum_dst == 1 && mask->dst.addr_bytes[0] == 1) { + if (!(spec->dst.addr_bytes[0] & 1)) { + msg = "mlx4 does not support the explicit" + " exclusion of all multicast traffic"; + goto error; + } + flow->allmulti = 1; + } else if (sum_dst != (UINT8_C(0xff) * ETHER_ADDR_LEN)) { + msg = "mlx4 does not support matching partial" + " Ethernet fields"; + goto error; + } + } + if (!flow->ibv_attr) + return 0; + if (flow->promisc) { + flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT; + return 0; + } + if (flow->allmulti) { + flow->ibv_attr->type = IBV_FLOW_ATTR_MC_DEFAULT; + return 0; + } + ++flow->ibv_attr->num_of_specs; + eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); + *eth = (struct ibv_flow_spec_eth) { + .type = IBV_FLOW_SPEC_ETH, + .size = sizeof(*eth), + }; + if (!mask) { + eth->val.dst_mac[0] = 0xff; + flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT; + flow->promisc = 1; + return 0; + } + memcpy(eth->val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN); + memcpy(eth->mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN); + /* Remove unwanted bits from values. */ + for (i = 0; i < ETHER_ADDR_LEN; ++i) { + eth->val.dst_mac[i] &= eth->mask.dst_mac[i]; + } + return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); +} + +/** + * Merge VLAN pattern item into flow rule handle. + * + * Additional mlx4-specific constraints on supported fields: + * + * - Matching *all* VLAN traffic by omitting @p item->spec or providing an + * empty @p item->mask would also include non-VLAN traffic. Doing so is + * therefore unsupported. + * - No support for partial masks. + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_merge_vlan(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; + struct ibv_flow_spec_eth *eth; + const char *msg; + + if (!mask || !mask->tci) { + msg = "mlx4 cannot match all VLAN traffic while excluding" + " non-VLAN traffic, TCI VID must be specified"; + goto error; + } + if (mask->tci != RTE_BE16(0x0fff)) { + msg = "mlx4 does not support partial TCI VID matching"; + goto error; + } + if (!flow->ibv_attr) + return 0; + eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size - + sizeof(*eth)); + eth->val.vlan_tag = spec->tci; + eth->mask.vlan_tag = mask->tci; + eth->val.vlan_tag &= eth->mask.vlan_tag; + if (flow->ibv_attr->type == IBV_FLOW_ATTR_ALL_DEFAULT) + flow->ibv_attr->type = IBV_FLOW_ATTR_NORMAL; + return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); +} + +/** + * Merge IPv4 pattern item into flow rule handle. + * + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks. + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_merge_ipv4(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; + struct ibv_flow_spec_ipv4 *ipv4; + const char *msg; + + if (mask && + ((uint32_t)(mask->hdr.src_addr + 1) > UINT32_C(1) || + (uint32_t)(mask->hdr.dst_addr + 1) > UINT32_C(1))) { + msg = "mlx4 does not support matching partial IPv4 fields"; + goto error; + } + if (!flow->ibv_attr) + return 0; + ++flow->ibv_attr->num_of_specs; + ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); + *ipv4 = (struct ibv_flow_spec_ipv4) { + .type = IBV_FLOW_SPEC_IPV4, + .size = sizeof(*ipv4), + }; + if (!spec) + return 0; + ipv4->val = (struct ibv_flow_ipv4_filter) { + .src_ip = spec->hdr.src_addr, + .dst_ip = spec->hdr.dst_addr, + }; + ipv4->mask = (struct ibv_flow_ipv4_filter) { + .src_ip = mask->hdr.src_addr, + .dst_ip = mask->hdr.dst_addr, + }; + /* Remove unwanted bits from values. */ + ipv4->val.src_ip &= ipv4->mask.src_ip; + ipv4->val.dst_ip &= ipv4->mask.dst_ip; + return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); +} + +/** + * Merge UDP pattern item into flow rule handle. + * + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks. + * - Due to HW/FW limitation, flow rule priority is not taken into account + * when matching UDP destination ports, doing is therefore only supported + * at the highest priority level (0). + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_merge_udp(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) +{ + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; + struct ibv_flow_spec_tcp_udp *udp; + const char *msg; + + if (mask && + ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) || + (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) { + msg = "mlx4 does not support matching partial UDP fields"; + goto error; + } + if (mask && mask->hdr.dst_port && flow->priority) { + msg = "combining UDP destination port matching with a nonzero" + " priority level is not supported"; + goto error; + } + if (!flow->ibv_attr) + return 0; + ++flow->ibv_attr->num_of_specs; + udp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); + *udp = (struct ibv_flow_spec_tcp_udp) { + .type = IBV_FLOW_SPEC_UDP, + .size = sizeof(*udp), + }; + if (!spec) + return 0; + udp->val.dst_port = spec->hdr.dst_port; + udp->val.src_port = spec->hdr.src_port; + udp->mask.dst_port = mask->hdr.dst_port; + udp->mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + udp->val.src_port &= udp->mask.src_port; + udp->val.dst_port &= udp->mask.dst_port; + return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); +} + +/** + * Merge TCP pattern item into flow rule handle. + * + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks. + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_merge_tcp(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) +{ + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; + struct ibv_flow_spec_tcp_udp *tcp; + const char *msg; + + if (mask && + ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) || + (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) { + msg = "mlx4 does not support matching partial TCP fields"; + goto error; + } + if (!flow->ibv_attr) + return 0; + ++flow->ibv_attr->num_of_specs; + tcp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); + *tcp = (struct ibv_flow_spec_tcp_udp) { + .type = IBV_FLOW_SPEC_TCP, + .size = sizeof(*tcp), + }; + if (!spec) + return 0; + tcp->val.dst_port = spec->hdr.dst_port; + tcp->val.src_port = spec->hdr.src_port; + tcp->mask.dst_port = mask->hdr.dst_port; + tcp->mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + tcp->val.src_port &= tcp->mask.src_port; + tcp->val.dst_port &= tcp->mask.dst_port; + return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); +} + +/** + * Perform basic sanity checks on a pattern item. + * + * @param[in] item + * Item specification. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_item_check(const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) +{ + const uint8_t *mask; + unsigned int i; + + /* item->last and item->mask cannot exist without item->spec. */ + if (!item->spec && (item->mask || item->last)) + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, + "\"mask\" or \"last\" field provided without a" + " corresponding \"spec\""); + /* No spec, no mask, no problem. */ + if (!item->spec) + return 0; + mask = item->mask ? + (const uint8_t *)item->mask : + (const uint8_t *)proc->mask_default; + assert(mask); + /* + * Single-pass check to make sure that: + * - Mask is supported, no bits are set outside proc->mask_support. + * - Both item->spec and item->last are included in mask. + */ + for (i = 0; i != proc->mask_sz; ++i) { + if (!mask[i]) + continue; + if ((mask[i] | ((const uint8_t *)proc->mask_support)[i]) != + ((const uint8_t *)proc->mask_support)[i]) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, "unsupported field found in \"mask\""); + if (item->last && + (((const uint8_t *)item->spec)[i] & mask[i]) != + (((const uint8_t *)item->last)[i] & mask[i])) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, + "range between \"spec\" and \"last\"" + " is larger than \"mask\""); + } + return 0; +} + +/** Graph of supported items and associated actions. */ +static const struct mlx4_flow_proc_item mlx4_flow_proc_item_list[] = { + [RTE_FLOW_ITEM_TYPE_END] = { + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_ETH), + }, + [RTE_FLOW_ITEM_TYPE_ETH] = { + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_VLAN, + RTE_FLOW_ITEM_TYPE_IPV4), + .mask_support = &(const struct rte_flow_item_eth){ + /* Only destination MAC can be matched. */ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + .mask_default = &rte_flow_item_eth_mask, + .mask_sz = sizeof(struct rte_flow_item_eth), + .merge = mlx4_flow_merge_eth, + .dst_sz = sizeof(struct ibv_flow_spec_eth), + }, + [RTE_FLOW_ITEM_TYPE_VLAN] = { + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_IPV4), + .mask_support = &(const struct rte_flow_item_vlan){ + /* Only TCI VID matching is supported. */ + .tci = RTE_BE16(0x0fff), + }, + .mask_default = &rte_flow_item_vlan_mask, + .mask_sz = sizeof(struct rte_flow_item_vlan), + .merge = mlx4_flow_merge_vlan, + .dst_sz = 0, + }, + [RTE_FLOW_ITEM_TYPE_IPV4] = { + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_TCP), + .mask_support = &(const struct rte_flow_item_ipv4){ + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + }, + }, + .mask_default = &rte_flow_item_ipv4_mask, + .mask_sz = sizeof(struct rte_flow_item_ipv4), + .merge = mlx4_flow_merge_ipv4, + .dst_sz = sizeof(struct ibv_flow_spec_ipv4), + }, + [RTE_FLOW_ITEM_TYPE_UDP] = { + .mask_support = &(const struct rte_flow_item_udp){ + .hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, + }, + .mask_default = &rte_flow_item_udp_mask, + .mask_sz = sizeof(struct rte_flow_item_udp), + .merge = mlx4_flow_merge_udp, + .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), + }, + [RTE_FLOW_ITEM_TYPE_TCP] = { + .mask_support = &(const struct rte_flow_item_tcp){ + .hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, + }, + .mask_default = &rte_flow_item_tcp_mask, + .mask_sz = sizeof(struct rte_flow_item_tcp), + .merge = mlx4_flow_merge_tcp, + .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), + }, +}; + +/** + * Make sure a flow rule is supported and initialize associated structure. + * + * @param priv + * Pointer to private structure. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. + * @param[in, out] addr + * Buffer where the resulting flow rule handle pointer must be stored. + * If NULL, stop processing after validation stage. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_prepare(struct mlx4_priv *priv, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error, + struct rte_flow **addr) +{ + const struct rte_flow_item *item; + const struct rte_flow_action *action; + const struct mlx4_flow_proc_item *proc; + struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) }; + struct rte_flow *flow = &temp; + const char *msg = NULL; + int overlap; + + if (attr->group) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, "groups are not supported"); + if (attr->priority > MLX4_FLOW_PRIORITY_LAST) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "maximum priority level is " + MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST)); + if (attr->egress) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, "egress is not supported"); + if (attr->transfer) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, "transfer is not supported"); + if (!attr->ingress) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, "only ingress is supported"); +fill: + overlap = 0; + proc = mlx4_flow_proc_item_list; + flow->priority = attr->priority; + /* Go over pattern. */ + for (item = pattern; item->type; ++item) { + const struct mlx4_flow_proc_item *next = NULL; + unsigned int i; + int err; + + if (item->type == RTE_FLOW_ITEM_TYPE_VOID) + continue; + if (item->type == MLX4_FLOW_ITEM_TYPE_INTERNAL) { + flow->internal = 1; + continue; + } + if (flow->promisc || flow->allmulti) { + msg = "mlx4 does not support additional matching" + " criteria combined with indiscriminate" + " matching on Ethernet headers"; + goto exit_item_not_supported; + } + for (i = 0; proc->next_item && proc->next_item[i]; ++i) { + if (proc->next_item[i] == item->type) { + next = &mlx4_flow_proc_item_list[item->type]; + break; + } + } + if (!next) + goto exit_item_not_supported; + proc = next; + /* + * Perform basic sanity checks only once, while handle is + * not allocated. + */ + if (flow == &temp) { + err = mlx4_flow_item_check(item, proc, error); + if (err) + return err; + } + if (proc->merge) { + err = proc->merge(flow, item, proc, error); + if (err) + return err; + } + flow->ibv_attr_size += proc->dst_sz; + } + /* Go over actions list. */ + for (action = actions; action->type; ++action) { + /* This one may appear anywhere multiple times. */ + if (action->type == RTE_FLOW_ACTION_TYPE_VOID) + continue; + /* Fate-deciding actions may appear exactly once. */ + if (overlap) { + msg = "cannot combine several fate-deciding actions," + " choose between DROP, QUEUE or RSS"; + goto exit_action_not_supported; + } + overlap = 1; + switch (action->type) { + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + const uint8_t *rss_key; + uint32_t rss_key_len; + uint64_t fields; + unsigned int i; + + case RTE_FLOW_ACTION_TYPE_DROP: + flow->drop = 1; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + if (flow->rss) + break; + queue = action->conf; + if (queue->index >= ETH_DEV(priv)->data->nb_rx_queues) { + msg = "queue target index beyond number of" + " configured Rx queues"; + goto exit_action_not_supported; + } + flow->rss = mlx4_rss_get + (priv, 0, mlx4_rss_hash_key_default, 1, + &queue->index); + if (!flow->rss) { + msg = "not enough resources for additional" + " single-queue RSS context"; + goto exit_action_not_supported; + } + break; + case RTE_FLOW_ACTION_TYPE_RSS: + if (flow->rss) + break; + rss = action->conf; + /* Default RSS configuration if none is provided. */ + if (rss->key_len) { + rss_key = rss->key; + rss_key_len = rss->key_len; + } else { + rss_key = mlx4_rss_hash_key_default; + rss_key_len = MLX4_RSS_HASH_KEY_SIZE; + } + /* Sanity checks. */ + for (i = 0; i < rss->queue_num; ++i) + if (rss->queue[i] >= + ETH_DEV(priv)->data->nb_rx_queues) + break; + if (i != rss->queue_num) { + msg = "queue index target beyond number of" + " configured Rx queues"; + goto exit_action_not_supported; + } + if (!rte_is_power_of_2(rss->queue_num)) { + msg = "for RSS, mlx4 requires the number of" + " queues to be a power of two"; + goto exit_action_not_supported; + } + if (rss_key_len != sizeof(flow->rss->key)) { + msg = "mlx4 supports exactly one RSS hash key" + " length: " + MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE); + goto exit_action_not_supported; + } + for (i = 1; i < rss->queue_num; ++i) + if (rss->queue[i] - rss->queue[i - 1] != 1) + break; + if (i != rss->queue_num) { + msg = "mlx4 requires RSS contexts to use" + " consecutive queue indices only"; + goto exit_action_not_supported; + } + if (rss->queue[0] % rss->queue_num) { + msg = "mlx4 requires the first queue of a RSS" + " context to be aligned on a multiple" + " of the context size"; + goto exit_action_not_supported; + } + if (rss->func && + rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) { + msg = "the only supported RSS hash function" + " is Toeplitz"; + goto exit_action_not_supported; + } + if (rss->level) { + msg = "a nonzero RSS encapsulation level is" + " not supported"; + goto exit_action_not_supported; + } + rte_errno = 0; + fields = mlx4_conv_rss_types(priv, rss->types, 0); + if (fields == (uint64_t)-1 && rte_errno) { + msg = "unsupported RSS hash type requested"; + goto exit_action_not_supported; + } + flow->rss = mlx4_rss_get + (priv, fields, rss_key, rss->queue_num, + rss->queue); + if (!flow->rss) { + msg = "either invalid parameters or not enough" + " resources for additional multi-queue" + " RSS context"; + goto exit_action_not_supported; + } + break; + default: + goto exit_action_not_supported; + } + } + /* When fate is unknown, drop traffic. */ + if (!overlap) + flow->drop = 1; + /* Validation ends here. */ + if (!addr) { + if (flow->rss) + mlx4_rss_put(flow->rss); + return 0; + } + if (flow == &temp) { + /* Allocate proper handle based on collected data. */ + const struct mlx4_malloc_vec vec[] = { + { + .align = alignof(struct rte_flow), + .size = sizeof(*flow), + .addr = (void **)&flow, + }, + { + .align = alignof(struct ibv_flow_attr), + .size = temp.ibv_attr_size, + .addr = (void **)&temp.ibv_attr, + }, + }; + + if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec))) { + if (temp.rss) + mlx4_rss_put(temp.rss); + return rte_flow_error_set + (error, -rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "flow rule handle allocation failure"); + } + /* Most fields will be updated by second pass. */ + *flow = (struct rte_flow){ + .ibv_attr = temp.ibv_attr, + .ibv_attr_size = sizeof(*flow->ibv_attr), + .rss = temp.rss, + }; + *flow->ibv_attr = (struct ibv_flow_attr){ + .type = IBV_FLOW_ATTR_NORMAL, + .size = sizeof(*flow->ibv_attr), + .priority = attr->priority, + .port = priv->port, + }; + goto fill; + } + *addr = flow; + return 0; +exit_item_not_supported: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg ? msg : "item not supported"); +exit_action_not_supported: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + action, msg ? msg : "action not supported"); +} + +/** + * Validate a flow supported by the NIC. + * + * @see rte_flow_validate() + * @see rte_flow_ops + */ +static int +mlx4_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct mlx4_priv *priv = dev->data->dev_private; + + return mlx4_flow_prepare(priv, attr, pattern, actions, error, NULL); +} + +/** + * Get a drop flow rule resources instance. + * + * @param priv + * Pointer to private structure. + * + * @return + * Pointer to drop flow resources on success, NULL otherwise and rte_errno + * is set. + */ +static struct mlx4_drop * +mlx4_drop_get(struct mlx4_priv *priv) +{ + struct mlx4_drop *drop = priv->drop; + + if (drop) { + assert(drop->refcnt); + assert(drop->priv == priv); + ++drop->refcnt; + return drop; + } + drop = rte_malloc(__func__, sizeof(*drop), 0); + if (!drop) + goto error; + *drop = (struct mlx4_drop){ + .priv = priv, + .refcnt = 1, + }; + drop->cq = mlx4_glue->create_cq(priv->ctx, 1, NULL, NULL, 0); + if (!drop->cq) + goto error; + drop->qp = mlx4_glue->create_qp + (priv->pd, + &(struct ibv_qp_init_attr){ + .send_cq = drop->cq, + .recv_cq = drop->cq, + .qp_type = IBV_QPT_RAW_PACKET, + }); + if (!drop->qp) + goto error; + priv->drop = drop; + return drop; +error: + if (drop->qp) + claim_zero(mlx4_glue->destroy_qp(drop->qp)); + if (drop->cq) + claim_zero(mlx4_glue->destroy_cq(drop->cq)); + if (drop) + rte_free(drop); + rte_errno = ENOMEM; + return NULL; +} + +/** + * Give back a drop flow rule resources instance. + * + * @param drop + * Pointer to drop flow rule resources. + */ +static void +mlx4_drop_put(struct mlx4_drop *drop) +{ + assert(drop->refcnt); + if (--drop->refcnt) + return; + drop->priv->drop = NULL; + claim_zero(mlx4_glue->destroy_qp(drop->qp)); + claim_zero(mlx4_glue->destroy_cq(drop->cq)); + rte_free(drop); +} + +/** + * Toggle a configured flow rule. + * + * @param priv + * Pointer to private structure. + * @param flow + * Flow rule handle to toggle. + * @param enable + * Whether associated Verbs flow must be created or removed. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_toggle(struct mlx4_priv *priv, + struct rte_flow *flow, + int enable, + struct rte_flow_error *error) +{ + struct ibv_qp *qp = NULL; + const char *msg; + int err; + + if (!enable) { + if (!flow->ibv_flow) + return 0; + claim_zero(mlx4_glue->destroy_flow(flow->ibv_flow)); + flow->ibv_flow = NULL; + if (flow->drop) + mlx4_drop_put(priv->drop); + else if (flow->rss) + mlx4_rss_detach(flow->rss); + return 0; + } + assert(flow->ibv_attr); + if (!flow->internal && + !priv->isolated && + flow->ibv_attr->priority == MLX4_FLOW_PRIORITY_LAST) { + if (flow->ibv_flow) { + claim_zero(mlx4_glue->destroy_flow(flow->ibv_flow)); + flow->ibv_flow = NULL; + if (flow->drop) + mlx4_drop_put(priv->drop); + else if (flow->rss) + mlx4_rss_detach(flow->rss); + } + err = EACCES; + msg = ("priority level " + MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST) + " is reserved when not in isolated mode"); + goto error; + } + if (flow->rss) { + struct mlx4_rss *rss = flow->rss; + int missing = 0; + unsigned int i; + + /* Stop at the first nonexistent target queue. */ + for (i = 0; i != rss->queues; ++i) + if (rss->queue_id[i] >= + ETH_DEV(priv)->data->nb_rx_queues || + !ETH_DEV(priv)->data->rx_queues[rss->queue_id[i]]) { + missing = 1; + break; + } + if (flow->ibv_flow) { + if (missing ^ !flow->drop) + return 0; + /* Verbs flow needs updating. */ + claim_zero(mlx4_glue->destroy_flow(flow->ibv_flow)); + flow->ibv_flow = NULL; + if (flow->drop) + mlx4_drop_put(priv->drop); + else + mlx4_rss_detach(rss); + } + if (!missing) { + err = mlx4_rss_attach(rss); + if (err) { + err = -err; + msg = "cannot create indirection table or hash" + " QP to associate flow rule with"; + goto error; + } + qp = rss->qp; + } + /* A missing target queue drops traffic implicitly. */ + flow->drop = missing; + } + if (flow->drop) { + if (flow->ibv_flow) + return 0; + mlx4_drop_get(priv); + if (!priv->drop) { + err = rte_errno; + msg = "resources for drop flow rule cannot be created"; + goto error; + } + qp = priv->drop->qp; + } + assert(qp); + if (flow->ibv_flow) + return 0; + flow->ibv_flow = mlx4_glue->create_flow(qp, flow->ibv_attr); + if (flow->ibv_flow) + return 0; + if (flow->drop) + mlx4_drop_put(priv->drop); + else if (flow->rss) + mlx4_rss_detach(flow->rss); + err = errno; + msg = "flow rule rejected by device"; +error: + return rte_flow_error_set + (error, err, RTE_FLOW_ERROR_TYPE_HANDLE, flow, msg); +} + +/** + * Create a flow. + * + * @see rte_flow_create() + * @see rte_flow_ops + */ +static struct rte_flow * +mlx4_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow *flow; + int err; + + err = mlx4_flow_prepare(priv, attr, pattern, actions, error, &flow); + if (err) + return NULL; + err = mlx4_flow_toggle(priv, flow, priv->started, error); + if (!err) { + struct rte_flow *curr = LIST_FIRST(&priv->flows); + + /* New rules are inserted after internal ones. */ + if (!curr || !curr->internal) { + LIST_INSERT_HEAD(&priv->flows, flow, next); + } else { + while (LIST_NEXT(curr, next) && + LIST_NEXT(curr, next)->internal) + curr = LIST_NEXT(curr, next); + LIST_INSERT_AFTER(curr, flow, next); + } + return flow; + } + if (flow->rss) + mlx4_rss_put(flow->rss); + rte_flow_error_set(error, -err, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + error->message); + rte_free(flow); + return NULL; +} + +/** + * Configure isolated mode. + * + * @see rte_flow_isolate() + * @see rte_flow_ops + */ +static int +mlx4_flow_isolate(struct rte_eth_dev *dev, + int enable, + struct rte_flow_error *error) +{ + struct mlx4_priv *priv = dev->data->dev_private; + + if (!!enable == !!priv->isolated) + return 0; + priv->isolated = !!enable; + if (mlx4_flow_sync(priv, error)) { + priv->isolated = !enable; + return -rte_errno; + } + return 0; +} + +/** + * Destroy a flow rule. + * + * @see rte_flow_destroy() + * @see rte_flow_ops + */ +static int +mlx4_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct mlx4_priv *priv = dev->data->dev_private; + int err = mlx4_flow_toggle(priv, flow, 0, error); + + if (err) + return err; + LIST_REMOVE(flow, next); + if (flow->rss) + mlx4_rss_put(flow->rss); + rte_free(flow); + return 0; +} + +/** + * Destroy user-configured flow rules. + * + * This function skips internal flows rules. + * + * @see rte_flow_flush() + * @see rte_flow_ops + */ +static int +mlx4_flow_flush(struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_flow *flow = LIST_FIRST(&priv->flows); + + while (flow) { + struct rte_flow *next = LIST_NEXT(flow, next); + + if (!flow->internal) + mlx4_flow_destroy(dev, flow, error); + flow = next; + } + return 0; +} + +/** + * Helper function to determine the next configured VLAN filter. + * + * @param priv + * Pointer to private structure. + * @param vlan + * VLAN ID to use as a starting point. + * + * @return + * Next configured VLAN ID or a high value (>= 4096) if there is none. + */ +static uint16_t +mlx4_flow_internal_next_vlan(struct mlx4_priv *priv, uint16_t vlan) +{ + while (vlan < 4096) { + if (ETH_DEV(priv)->data->vlan_filter_conf.ids[vlan / 64] & + (UINT64_C(1) << (vlan % 64))) + return vlan; + ++vlan; + } + return vlan; +} + +/** + * Generate internal flow rules. + * + * Various flow rules are created depending on the mode the device is in: + * + * 1. Promiscuous: + * port MAC + broadcast + catch-all (VLAN filtering is ignored). + * 2. All multicast: + * port MAC/VLAN + broadcast + catch-all multicast. + * 3. Otherwise: + * port MAC/VLAN + broadcast MAC/VLAN. + * + * About MAC flow rules: + * + * - MAC flow rules are generated from @p dev->data->mac_addrs + * (@p priv->mac array). + * - An additional flow rule for Ethernet broadcasts is also generated. + * - All these are per-VLAN if @p DEV_RX_OFFLOAD_VLAN_FILTER + * is enabled and VLAN filters are configured. + * + * @param priv + * Pointer to private structure. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_internal(struct mlx4_priv *priv, struct rte_flow_error *error) +{ + struct rte_flow_attr attr = { + .priority = MLX4_FLOW_PRIORITY_LAST, + .ingress = 1, + }; + struct rte_flow_item_eth eth_spec; + const struct rte_flow_item_eth eth_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }; + const struct rte_flow_item_eth eth_allmulti = { + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + }; + struct rte_flow_item_vlan vlan_spec; + const struct rte_flow_item_vlan vlan_mask = { + .tci = RTE_BE16(0x0fff), + }; + struct rte_flow_item pattern[] = { + { + .type = MLX4_FLOW_ITEM_TYPE_INTERNAL, + }, + { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = ð_spec, + .mask = ð_mask, + }, + { + /* Replaced with VLAN if filtering is enabled. */ + .type = RTE_FLOW_ITEM_TYPE_END, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + /* + * Round number of queues down to their previous power of 2 to + * comply with RSS context limitations. Extra queues silently do not + * get RSS by default. + */ + uint32_t queues = + rte_align32pow2(ETH_DEV(priv)->data->nb_rx_queues + 1) >> 1; + uint16_t queue[queues]; + struct rte_flow_action_rss action_rss = { + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = 0, + .types = 0, + .key_len = MLX4_RSS_HASH_KEY_SIZE, + .queue_num = queues, + .key = mlx4_rss_hash_key_default, + .queue = queue, + }; + struct rte_flow_action actions[] = { + { + .type = RTE_FLOW_ACTION_TYPE_RSS, + .conf = &action_rss, + }, + { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + struct ether_addr *rule_mac = ð_spec.dst; + rte_be16_t *rule_vlan = + (ETH_DEV(priv)->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_FILTER) && + !ETH_DEV(priv)->data->promiscuous ? + &vlan_spec.tci : + NULL; + uint16_t vlan = 0; + struct rte_flow *flow; + unsigned int i; + int err = 0; + + /* Nothing to be done if there are no Rx queues. */ + if (!queues) + goto error; + /* Prepare default RSS configuration. */ + for (i = 0; i != queues; ++i) + queue[i] = i; + /* + * Set up VLAN item if filtering is enabled and at least one VLAN + * filter is configured. + */ + if (rule_vlan) { + vlan = mlx4_flow_internal_next_vlan(priv, 0); + if (vlan < 4096) { + pattern[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_VLAN, + .spec = &vlan_spec, + .mask = &vlan_mask, + }; +next_vlan: + *rule_vlan = rte_cpu_to_be_16(vlan); + } else { + rule_vlan = NULL; + } + } + for (i = 0; i != RTE_DIM(priv->mac) + 1; ++i) { + const struct ether_addr *mac; + + /* Broadcasts are handled by an extra iteration. */ + if (i < RTE_DIM(priv->mac)) + mac = &priv->mac[i]; + else + mac = ð_mask.dst; + if (is_zero_ether_addr(mac)) + continue; + /* Check if MAC flow rule is already present. */ + for (flow = LIST_FIRST(&priv->flows); + flow && flow->internal; + flow = LIST_NEXT(flow, next)) { + const struct ibv_flow_spec_eth *eth = + (const void *)((uintptr_t)flow->ibv_attr + + sizeof(*flow->ibv_attr)); + unsigned int j; + + if (!flow->mac) + continue; + assert(flow->ibv_attr->type == IBV_FLOW_ATTR_NORMAL); + assert(flow->ibv_attr->num_of_specs == 1); + assert(eth->type == IBV_FLOW_SPEC_ETH); + assert(flow->rss); + if (rule_vlan && + (eth->val.vlan_tag != *rule_vlan || + eth->mask.vlan_tag != RTE_BE16(0x0fff))) + continue; + if (!rule_vlan && eth->mask.vlan_tag) + continue; + for (j = 0; j != sizeof(mac->addr_bytes); ++j) + if (eth->val.dst_mac[j] != mac->addr_bytes[j] || + eth->mask.dst_mac[j] != UINT8_C(0xff) || + eth->val.src_mac[j] != UINT8_C(0x00) || + eth->mask.src_mac[j] != UINT8_C(0x00)) + break; + if (j != sizeof(mac->addr_bytes)) + continue; + if (flow->rss->queues != queues || + memcmp(flow->rss->queue_id, action_rss.queue, + queues * sizeof(flow->rss->queue_id[0]))) + continue; + break; + } + if (!flow || !flow->internal) { + /* Not found, create a new flow rule. */ + memcpy(rule_mac, mac, sizeof(*mac)); + flow = mlx4_flow_create(ETH_DEV(priv), &attr, pattern, + actions, error); + if (!flow) { + err = -rte_errno; + goto error; + } + } + flow->select = 1; + flow->mac = 1; + } + if (rule_vlan) { + vlan = mlx4_flow_internal_next_vlan(priv, vlan + 1); + if (vlan < 4096) + goto next_vlan; + } + /* Take care of promiscuous and all multicast flow rules. */ + if (ETH_DEV(priv)->data->promiscuous || + ETH_DEV(priv)->data->all_multicast) { + for (flow = LIST_FIRST(&priv->flows); + flow && flow->internal; + flow = LIST_NEXT(flow, next)) { + if (ETH_DEV(priv)->data->promiscuous) { + if (flow->promisc) + break; + } else { + assert(ETH_DEV(priv)->data->all_multicast); + if (flow->allmulti) + break; + } + } + if (flow && flow->internal) { + assert(flow->rss); + if (flow->rss->queues != queues || + memcmp(flow->rss->queue_id, action_rss.queue, + queues * sizeof(flow->rss->queue_id[0]))) + flow = NULL; + } + if (!flow || !flow->internal) { + /* Not found, create a new flow rule. */ + if (ETH_DEV(priv)->data->promiscuous) { + pattern[1].spec = NULL; + pattern[1].mask = NULL; + } else { + assert(ETH_DEV(priv)->data->all_multicast); + pattern[1].spec = ð_allmulti; + pattern[1].mask = ð_allmulti; + } + pattern[2] = pattern[3]; + flow = mlx4_flow_create(ETH_DEV(priv), &attr, pattern, + actions, error); + if (!flow) { + err = -rte_errno; + goto error; + } + } + assert(flow->promisc || flow->allmulti); + flow->select = 1; + } +error: + /* Clear selection and clean up stale internal flow rules. */ + flow = LIST_FIRST(&priv->flows); + while (flow && flow->internal) { + struct rte_flow *next = LIST_NEXT(flow, next); + + if (!flow->select) + claim_zero(mlx4_flow_destroy(ETH_DEV(priv), flow, + error)); + else + flow->select = 0; + flow = next; + } + return err; +} + +/** + * Synchronize flow rules. + * + * This function synchronizes flow rules with the state of the device by + * taking into account isolated mode and whether target queues are + * configured. + * + * @param priv + * Pointer to private structure. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_flow_sync(struct mlx4_priv *priv, struct rte_flow_error *error) +{ + struct rte_flow *flow; + int ret; + + /* Internal flow rules are guaranteed to come first in the list. */ + if (priv->isolated) { + /* + * Get rid of them in isolated mode, stop at the first + * non-internal rule found. + */ + for (flow = LIST_FIRST(&priv->flows); + flow && flow->internal; + flow = LIST_FIRST(&priv->flows)) + claim_zero(mlx4_flow_destroy(ETH_DEV(priv), flow, + error)); + } else { + /* Refresh internal rules. */ + ret = mlx4_flow_internal(priv, error); + if (ret) + return ret; + } + /* Toggle the remaining flow rules . */ + LIST_FOREACH(flow, &priv->flows, next) { + ret = mlx4_flow_toggle(priv, flow, priv->started, error); + if (ret) + return ret; + } + if (!priv->started) + assert(!priv->drop); + return 0; +} + +/** + * Clean up all flow rules. + * + * Unlike mlx4_flow_flush(), this function takes care of all remaining flow + * rules regardless of whether they are internal or user-configured. + * + * @param priv + * Pointer to private structure. + */ +void +mlx4_flow_clean(struct mlx4_priv *priv) +{ + struct rte_flow *flow; + + while ((flow = LIST_FIRST(&priv->flows))) + mlx4_flow_destroy(ETH_DEV(priv), flow, NULL); + assert(LIST_EMPTY(&priv->rss)); +} + +static const struct rte_flow_ops mlx4_flow_ops = { + .validate = mlx4_flow_validate, + .create = mlx4_flow_create, + .destroy = mlx4_flow_destroy, + .flush = mlx4_flow_flush, + .isolate = mlx4_flow_isolate, +}; + +/** + * Manage filter operations. + * + * @param dev + * Pointer to Ethernet device structure. + * @param filter_type + * Filter type. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg) +{ + switch (filter_type) { + case RTE_ETH_FILTER_GENERIC: + if (filter_op != RTE_ETH_FILTER_GET) + break; + *(const void **)arg = &mlx4_flow_ops; + return 0; + default: + ERROR("%p: filter type (%d) not supported", + (void *)dev, filter_type); + break; + } + rte_errno = ENOTSUP; + return -rte_errno; +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_flow.h b/src/seastar/dpdk/drivers/net/mlx4/mlx4_flow.h new file mode 100644 index 000000000..26465c66a --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_flow.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX4_FLOW_H_ +#define RTE_PMD_MLX4_FLOW_H_ + +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_byteorder.h> + +/** Last and lowest priority level for a flow rule. */ +#define MLX4_FLOW_PRIORITY_LAST UINT32_C(0xfff) + +/** Meta pattern item used to distinguish internal rules. */ +#define MLX4_FLOW_ITEM_TYPE_INTERNAL ((enum rte_flow_item_type)-1) + +/** PMD-specific (mlx4) definition of a flow rule handle. */ +struct rte_flow { + LIST_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ + struct ibv_flow *ibv_flow; /**< Verbs flow. */ + struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */ + uint32_t ibv_attr_size; /**< Size of Verbs attributes. */ + uint32_t select:1; /**< Used by operations on the linked list. */ + uint32_t internal:1; /**< Internal flow rule outside isolated mode. */ + uint32_t mac:1; /**< Rule associated with a configured MAC address. */ + uint32_t promisc:1; /**< This rule matches everything. */ + uint32_t allmulti:1; /**< This rule matches all multicast traffic. */ + uint32_t drop:1; /**< This rule drops packets. */ + uint32_t priority; /**< Flow rule priority. */ + struct mlx4_rss *rss; /**< Rx target. */ +}; + +/* mlx4_flow.c */ + +uint64_t mlx4_conv_rss_types(struct mlx4_priv *priv, uint64_t types, + int verbs_to_dpdk); +int mlx4_flow_sync(struct mlx4_priv *priv, struct rte_flow_error *error); +void mlx4_flow_clean(struct mlx4_priv *priv); +int mlx4_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); + +#endif /* RTE_PMD_MLX4_FLOW_H_ */ diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_glue.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_glue.c new file mode 100644 index 000000000..67b3bface --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_glue.c @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <stdint.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include "mlx4_glue.h" + +static int +mlx4_glue_fork_init(void) +{ + return ibv_fork_init(); +} + +static int +mlx4_glue_get_async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ + return ibv_get_async_event(context, event); +} + +static void +mlx4_glue_ack_async_event(struct ibv_async_event *event) +{ + ibv_ack_async_event(event); +} + +static struct ibv_pd * +mlx4_glue_alloc_pd(struct ibv_context *context) +{ + return ibv_alloc_pd(context); +} + +static int +mlx4_glue_dealloc_pd(struct ibv_pd *pd) +{ + return ibv_dealloc_pd(pd); +} + +static struct ibv_device ** +mlx4_glue_get_device_list(int *num_devices) +{ + return ibv_get_device_list(num_devices); +} + +static void +mlx4_glue_free_device_list(struct ibv_device **list) +{ + ibv_free_device_list(list); +} + +static struct ibv_context * +mlx4_glue_open_device(struct ibv_device *device) +{ + return ibv_open_device(device); +} + +static int +mlx4_glue_close_device(struct ibv_context *context) +{ + return ibv_close_device(context); +} + +static const char * +mlx4_glue_get_device_name(struct ibv_device *device) +{ + return ibv_get_device_name(device); +} + +static int +mlx4_glue_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr) +{ + return ibv_query_device(context, device_attr); +} + +static int +mlx4_glue_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr) +{ + return ibv_query_device_ex(context, input, attr); +} + +static int +mlx4_glue_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + return ibv_query_port(context, port_num, port_attr); +} + +static const char * +mlx4_glue_port_state_str(enum ibv_port_state port_state) +{ + return ibv_port_state_str(port_state); +} + +static struct ibv_comp_channel * +mlx4_glue_create_comp_channel(struct ibv_context *context) +{ + return ibv_create_comp_channel(context); +} + +static int +mlx4_glue_destroy_comp_channel(struct ibv_comp_channel *channel) +{ + return ibv_destroy_comp_channel(channel); +} + +static struct ibv_cq * +mlx4_glue_create_cq(struct ibv_context *context, int cqe, void *cq_context, + struct ibv_comp_channel *channel, int comp_vector) +{ + return ibv_create_cq(context, cqe, cq_context, channel, comp_vector); +} + +static int +mlx4_glue_destroy_cq(struct ibv_cq *cq) +{ + return ibv_destroy_cq(cq); +} + +static int +mlx4_glue_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq, + void **cq_context) +{ + return ibv_get_cq_event(channel, cq, cq_context); +} + +static void +mlx4_glue_ack_cq_events(struct ibv_cq *cq, unsigned int nevents) +{ + ibv_ack_cq_events(cq, nevents); +} + +static struct ibv_flow * +mlx4_glue_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow) +{ + return ibv_create_flow(qp, flow); +} + +static int +mlx4_glue_destroy_flow(struct ibv_flow *flow_id) +{ + return ibv_destroy_flow(flow_id); +} + +static struct ibv_qp * +mlx4_glue_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) +{ + return ibv_create_qp(pd, qp_init_attr); +} + +static struct ibv_qp * +mlx4_glue_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex) +{ + return ibv_create_qp_ex(context, qp_init_attr_ex); +} + +static int +mlx4_glue_destroy_qp(struct ibv_qp *qp) +{ + return ibv_destroy_qp(qp); +} + +static int +mlx4_glue_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) +{ + return ibv_modify_qp(qp, attr, attr_mask); +} + +static struct ibv_mr * +mlx4_glue_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) +{ + return ibv_reg_mr(pd, addr, length, access); +} + +static int +mlx4_glue_dereg_mr(struct ibv_mr *mr) +{ + return ibv_dereg_mr(mr); +} + +static struct ibv_rwq_ind_table * +mlx4_glue_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr) +{ + return ibv_create_rwq_ind_table(context, init_attr); +} + +static int +mlx4_glue_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) +{ + return ibv_destroy_rwq_ind_table(rwq_ind_table); +} + +static struct ibv_wq * +mlx4_glue_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr) +{ + return ibv_create_wq(context, wq_init_attr); +} + +static int +mlx4_glue_destroy_wq(struct ibv_wq *wq) +{ + return ibv_destroy_wq(wq); +} +static int +mlx4_glue_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr) +{ + return ibv_modify_wq(wq, wq_attr); +} + +static int +mlx4_glue_dv_init_obj(struct mlx4dv_obj *obj, uint64_t obj_type) +{ + return mlx4dv_init_obj(obj, obj_type); +} + +static int +mlx4_glue_dv_set_context_attr(struct ibv_context *context, + enum mlx4dv_set_ctx_attr_type attr_type, + void *attr) +{ + return mlx4dv_set_context_attr(context, attr_type, attr); +} + +const struct mlx4_glue *mlx4_glue = &(const struct mlx4_glue){ + .version = MLX4_GLUE_VERSION, + .fork_init = mlx4_glue_fork_init, + .get_async_event = mlx4_glue_get_async_event, + .ack_async_event = mlx4_glue_ack_async_event, + .alloc_pd = mlx4_glue_alloc_pd, + .dealloc_pd = mlx4_glue_dealloc_pd, + .get_device_list = mlx4_glue_get_device_list, + .free_device_list = mlx4_glue_free_device_list, + .open_device = mlx4_glue_open_device, + .close_device = mlx4_glue_close_device, + .get_device_name = mlx4_glue_get_device_name, + .query_device = mlx4_glue_query_device, + .query_device_ex = mlx4_glue_query_device_ex, + .query_port = mlx4_glue_query_port, + .port_state_str = mlx4_glue_port_state_str, + .create_comp_channel = mlx4_glue_create_comp_channel, + .destroy_comp_channel = mlx4_glue_destroy_comp_channel, + .create_cq = mlx4_glue_create_cq, + .destroy_cq = mlx4_glue_destroy_cq, + .get_cq_event = mlx4_glue_get_cq_event, + .ack_cq_events = mlx4_glue_ack_cq_events, + .create_flow = mlx4_glue_create_flow, + .destroy_flow = mlx4_glue_destroy_flow, + .create_qp = mlx4_glue_create_qp, + .create_qp_ex = mlx4_glue_create_qp_ex, + .destroy_qp = mlx4_glue_destroy_qp, + .modify_qp = mlx4_glue_modify_qp, + .reg_mr = mlx4_glue_reg_mr, + .dereg_mr = mlx4_glue_dereg_mr, + .create_rwq_ind_table = mlx4_glue_create_rwq_ind_table, + .destroy_rwq_ind_table = mlx4_glue_destroy_rwq_ind_table, + .create_wq = mlx4_glue_create_wq, + .destroy_wq = mlx4_glue_destroy_wq, + .modify_wq = mlx4_glue_modify_wq, + .dv_init_obj = mlx4_glue_dv_init_obj, + .dv_set_context_attr = mlx4_glue_dv_set_context_attr, +}; diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_glue.h b/src/seastar/dpdk/drivers/net/mlx4/mlx4_glue.h new file mode 100644 index 000000000..668ca8670 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_glue.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef MLX4_GLUE_H_ +#define MLX4_GLUE_H_ + +#include <stddef.h> +#include <stdint.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#ifndef MLX4_GLUE_VERSION +#define MLX4_GLUE_VERSION "" +#endif + +/* LIB_GLUE_VERSION must be updated every time this structure is modified. */ +struct mlx4_glue { + const char *version; + int (*fork_init)(void); + int (*get_async_event)(struct ibv_context *context, + struct ibv_async_event *event); + void (*ack_async_event)(struct ibv_async_event *event); + struct ibv_pd *(*alloc_pd)(struct ibv_context *context); + int (*dealloc_pd)(struct ibv_pd *pd); + struct ibv_device **(*get_device_list)(int *num_devices); + void (*free_device_list)(struct ibv_device **list); + struct ibv_context *(*open_device)(struct ibv_device *device); + int (*close_device)(struct ibv_context *context); + const char *(*get_device_name)(struct ibv_device *device); + int (*query_device)(struct ibv_context *context, + struct ibv_device_attr *device_attr); + int (*query_device_ex)(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + const char *(*port_state_str)(enum ibv_port_state port_state); + struct ibv_comp_channel *(*create_comp_channel) + (struct ibv_context *context); + int (*destroy_comp_channel)(struct ibv_comp_channel *channel); + struct ibv_cq *(*create_cq)(struct ibv_context *context, int cqe, + void *cq_context, + struct ibv_comp_channel *channel, + int comp_vector); + int (*destroy_cq)(struct ibv_cq *cq); + int (*get_cq_event)(struct ibv_comp_channel *channel, + struct ibv_cq **cq, void **cq_context); + void (*ack_cq_events)(struct ibv_cq *cq, unsigned int nevents); + struct ibv_flow *(*create_flow)(struct ibv_qp *qp, + struct ibv_flow_attr *flow); + int (*destroy_flow)(struct ibv_flow *flow_id); + struct ibv_qp *(*create_qp)(struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr); + struct ibv_qp *(*create_qp_ex) + (struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); + int (*destroy_qp)(struct ibv_qp *qp); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr, + size_t length, int access); + int (*dereg_mr)(struct ibv_mr *mr); + struct ibv_rwq_ind_table *(*create_rwq_ind_table) + (struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr); + int (*destroy_rwq_ind_table)(struct ibv_rwq_ind_table *rwq_ind_table); + struct ibv_wq *(*create_wq)(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr); + int (*destroy_wq)(struct ibv_wq *wq); + int (*modify_wq)(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr); + int (*dv_init_obj)(struct mlx4dv_obj *obj, uint64_t obj_type); + int (*dv_set_context_attr)(struct ibv_context *context, + enum mlx4dv_set_ctx_attr_type attr_type, + void *attr); +}; + +const struct mlx4_glue *mlx4_glue; + +#endif /* MLX4_GLUE_H_ */ diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_intr.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_intr.c new file mode 100644 index 000000000..4f3352675 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_intr.c @@ -0,0 +1,406 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Interrupts handling for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_alarm.h> +#include <rte_errno.h> +#include <rte_ethdev_driver.h> +#include <rte_io.h> +#include <rte_interrupts.h> + +#include "mlx4.h" +#include "mlx4_glue.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +static int mlx4_link_status_check(struct mlx4_priv *priv); + +/** + * Clean up Rx interrupts handler. + * + * @param priv + * Pointer to private structure. + */ +static void +mlx4_rx_intr_vec_disable(struct mlx4_priv *priv) +{ + struct rte_intr_handle *intr_handle = &priv->intr_handle; + + rte_intr_free_epoll_fd(intr_handle); + free(intr_handle->intr_vec); + intr_handle->nb_efd = 0; + intr_handle->intr_vec = NULL; +} + +/** + * Allocate queue vector and fill epoll fd list for Rx interrupts. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_rx_intr_vec_enable(struct mlx4_priv *priv) +{ + unsigned int i; + unsigned int rxqs_n = ETH_DEV(priv)->data->nb_rx_queues; + unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + unsigned int count = 0; + struct rte_intr_handle *intr_handle = &priv->intr_handle; + + mlx4_rx_intr_vec_disable(priv); + intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0])); + if (intr_handle->intr_vec == NULL) { + rte_errno = ENOMEM; + ERROR("failed to allocate memory for interrupt vector," + " Rx interrupts will not be supported"); + return -rte_errno; + } + for (i = 0; i != n; ++i) { + struct rxq *rxq = ETH_DEV(priv)->data->rx_queues[i]; + + /* Skip queues that cannot request interrupts. */ + if (!rxq || !rxq->channel) { + /* Use invalid intr_vec[] index to disable entry. */ + intr_handle->intr_vec[i] = + RTE_INTR_VEC_RXTX_OFFSET + + RTE_MAX_RXTX_INTR_VEC_ID; + continue; + } + if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { + rte_errno = E2BIG; + ERROR("too many Rx queues for interrupt vector size" + " (%d), Rx interrupts cannot be enabled", + RTE_MAX_RXTX_INTR_VEC_ID); + mlx4_rx_intr_vec_disable(priv); + return -rte_errno; + } + intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; + intr_handle->efds[count] = rxq->channel->fd; + count++; + } + if (!count) + mlx4_rx_intr_vec_disable(priv); + else + intr_handle->nb_efd = count; + return 0; +} + +/** + * Process scheduled link status check. + * + * If LSC interrupts are requested, process related callback. + * + * @param priv + * Pointer to private structure. + */ +static void +mlx4_link_status_alarm(struct mlx4_priv *priv) +{ + const struct rte_intr_conf *const intr_conf = + Ð_DEV(priv)->data->dev_conf.intr_conf; + + assert(priv->intr_alarm == 1); + priv->intr_alarm = 0; + if (intr_conf->lsc && !mlx4_link_status_check(priv)) + _rte_eth_dev_callback_process(ETH_DEV(priv), + RTE_ETH_EVENT_INTR_LSC, + NULL); +} + +/** + * Check link status. + * + * In case of inconsistency, another check is scheduled. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success (link status is consistent), negative errno value + * otherwise and rte_errno is set. + */ +static int +mlx4_link_status_check(struct mlx4_priv *priv) +{ + struct rte_eth_link *link = Ð_DEV(priv)->data->dev_link; + int ret = mlx4_link_update(ETH_DEV(priv), 0); + + if (ret) + return ret; + if ((!link->link_speed && link->link_status) || + (link->link_speed && !link->link_status)) { + if (!priv->intr_alarm) { + /* Inconsistent status, check again later. */ + ret = rte_eal_alarm_set(MLX4_INTR_ALARM_TIMEOUT, + (void (*)(void *)) + mlx4_link_status_alarm, + priv); + if (ret) + return ret; + priv->intr_alarm = 1; + } + rte_errno = EINPROGRESS; + return -rte_errno; + } + return 0; +} + +/** + * Handle interrupts from the NIC. + * + * @param priv + * Pointer to private structure. + */ +static void +mlx4_interrupt_handler(struct mlx4_priv *priv) +{ + enum { LSC, RMV, }; + static const enum rte_eth_event_type type[] = { + [LSC] = RTE_ETH_EVENT_INTR_LSC, + [RMV] = RTE_ETH_EVENT_INTR_RMV, + }; + uint32_t caught[RTE_DIM(type)] = { 0 }; + struct ibv_async_event event; + const struct rte_intr_conf *const intr_conf = + Ð_DEV(priv)->data->dev_conf.intr_conf; + unsigned int i; + + /* Read all message and acknowledge them. */ + while (!mlx4_glue->get_async_event(priv->ctx, &event)) { + switch (event.event_type) { + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + if (intr_conf->lsc && !mlx4_link_status_check(priv)) + ++caught[LSC]; + break; + case IBV_EVENT_DEVICE_FATAL: + if (intr_conf->rmv) + ++caught[RMV]; + break; + default: + DEBUG("event type %d on physical port %d not handled", + event.event_type, event.element.port_num); + } + mlx4_glue->ack_async_event(&event); + } + for (i = 0; i != RTE_DIM(caught); ++i) + if (caught[i]) + _rte_eth_dev_callback_process(ETH_DEV(priv), type[i], + NULL); +} + +/** + * MLX4 CQ notification . + * + * @param rxq + * Pointer to receive queue structure. + * @param solicited + * Is request solicited or not. + */ +static void +mlx4_arm_cq(struct rxq *rxq, int solicited) +{ + struct mlx4_cq *cq = &rxq->mcq; + uint64_t doorbell; + uint32_t sn = cq->arm_sn & MLX4_CQ_DB_GEQ_N_MASK; + uint32_t ci = cq->cons_index & MLX4_CQ_DB_CI_MASK; + uint32_t cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT; + + *cq->arm_db = rte_cpu_to_be_32(sn << 28 | cmd | ci); + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + rte_wmb(); + doorbell = sn << 28 | cmd | cq->cqn; + doorbell <<= 32; + doorbell |= ci; + rte_write64(rte_cpu_to_be_64(doorbell), cq->cq_db_reg); +} + +/** + * Uninstall interrupt handler. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_intr_uninstall(struct mlx4_priv *priv) +{ + int err = rte_errno; /* Make sure rte_errno remains unchanged. */ + + if (priv->intr_handle.fd != -1) { + rte_intr_callback_unregister(&priv->intr_handle, + (void (*)(void *)) + mlx4_interrupt_handler, + priv); + priv->intr_handle.fd = -1; + } + rte_eal_alarm_cancel((void (*)(void *))mlx4_link_status_alarm, priv); + priv->intr_alarm = 0; + mlx4_rxq_intr_disable(priv); + rte_errno = err; + return 0; +} + +/** + * Install interrupt handler. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_intr_install(struct mlx4_priv *priv) +{ + const struct rte_intr_conf *const intr_conf = + Ð_DEV(priv)->data->dev_conf.intr_conf; + int rc; + + mlx4_intr_uninstall(priv); + if (intr_conf->lsc | intr_conf->rmv) { + priv->intr_handle.fd = priv->ctx->async_fd; + rc = rte_intr_callback_register(&priv->intr_handle, + (void (*)(void *)) + mlx4_interrupt_handler, + priv); + if (rc < 0) { + rte_errno = -rc; + goto error; + } + } + return 0; +error: + mlx4_intr_uninstall(priv); + return -rte_errno; +} + +/** + * DPDK callback for Rx queue interrupt disable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Rx queue index. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct rxq *rxq = dev->data->rx_queues[idx]; + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret; + + if (!rxq || !rxq->channel) { + ret = EINVAL; + } else { + ret = mlx4_glue->get_cq_event(rxq->cq->channel, &ev_cq, + &ev_ctx); + if (ret || ev_cq != rxq->cq) + ret = EINVAL; + } + if (ret) { + rte_errno = ret; + WARN("unable to disable interrupt on rx queue %d", + idx); + } else { + rxq->mcq.arm_sn++; + mlx4_glue->ack_cq_events(rxq->cq, 1); + } + return -ret; +} + +/** + * DPDK callback for Rx queue interrupt enable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Rx queue index. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct rxq *rxq = dev->data->rx_queues[idx]; + int ret = 0; + + if (!rxq || !rxq->channel) { + ret = EINVAL; + rte_errno = ret; + WARN("unable to arm interrupt on rx queue %d", idx); + } else { + mlx4_arm_cq(rxq, 0); + } + return -ret; +} + +/** + * Enable datapath interrupts. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rxq_intr_enable(struct mlx4_priv *priv) +{ + const struct rte_intr_conf *const intr_conf = + Ð_DEV(priv)->data->dev_conf.intr_conf; + + if (intr_conf->rxq && mlx4_rx_intr_vec_enable(priv) < 0) + goto error; + return 0; +error: + return -rte_errno; +} + +/** + * Disable datapath interrupts, keeping other interrupts intact. + * + * @param priv + * Pointer to private structure. + */ +void +mlx4_rxq_intr_disable(struct mlx4_priv *priv) +{ + int err = rte_errno; /* Make sure rte_errno remains unchanged. */ + + mlx4_rx_intr_vec_disable(priv); + rte_errno = err; +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_mp.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_mp.c new file mode 100644 index 000000000..183622453 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_mp.c @@ -0,0 +1,354 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 6WIND S.A. + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#include <assert.h> +#include <stdio.h> +#include <time.h> + +#include <rte_eal.h> +#include <rte_ethdev_driver.h> +#include <rte_string_fns.h> + +#include "mlx4.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Initialize IPC message. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[out] msg + * Pointer to message to fill in. + * @param[in] type + * Message type. + */ +static inline void +mp_init_msg(struct rte_eth_dev *dev, struct rte_mp_msg *msg, + enum mlx4_mp_req_type type) +{ + struct mlx4_mp_param *param = (struct mlx4_mp_param *)msg->param; + + memset(msg, 0, sizeof(*msg)); + strlcpy(msg->name, MLX4_MP_NAME, sizeof(msg->name)); + msg->len_param = sizeof(*param); + param->type = type; + param->port_id = dev->data->port_id; +} + +/** + * IPC message handler of primary process. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[in] peer + * Pointer to the peer socket path. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer) +{ + struct rte_mp_msg mp_res; + struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param; + const struct mlx4_mp_param *param = + (const struct mlx4_mp_param *)mp_msg->param; + struct rte_eth_dev *dev; + struct mlx4_priv *priv; + struct mlx4_mr_cache entry; + uint32_t lkey; + int ret; + + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (!rte_eth_dev_is_valid_port(param->port_id)) { + rte_errno = ENODEV; + ERROR("port %u invalid port ID", param->port_id); + return -rte_errno; + } + dev = &rte_eth_devices[param->port_id]; + priv = dev->data->dev_private; + switch (param->type) { + case MLX4_MP_REQ_CREATE_MR: + mp_init_msg(dev, &mp_res, param->type); + lkey = mlx4_mr_create_primary(dev, &entry, param->args.addr); + if (lkey == UINT32_MAX) + res->result = -rte_errno; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX4_MP_REQ_VERBS_CMD_FD: + mp_init_msg(dev, &mp_res, param->type); + mp_res.num_fds = 1; + mp_res.fds[0] = priv->ctx->cmd_fd; + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + default: + rte_errno = EINVAL; + ERROR("port %u invalid mp request type", dev->data->port_id); + return -rte_errno; + } + return ret; +} + +/** + * IPC message handler of a secondary process. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[in] peer + * Pointer to the peer socket path. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer) +{ + struct rte_mp_msg mp_res; + struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param; + const struct mlx4_mp_param *param = + (const struct mlx4_mp_param *)mp_msg->param; + struct rte_eth_dev *dev; + int ret; + + assert(rte_eal_process_type() == RTE_PROC_SECONDARY); + if (!rte_eth_dev_is_valid_port(param->port_id)) { + rte_errno = ENODEV; + ERROR("port %u invalid port ID", param->port_id); + return -rte_errno; + } + dev = &rte_eth_devices[param->port_id]; + switch (param->type) { + case MLX4_MP_REQ_START_RXTX: + INFO("port %u starting datapath", dev->data->port_id); + rte_mb(); + dev->tx_pkt_burst = mlx4_tx_burst; + dev->rx_pkt_burst = mlx4_rx_burst; + mp_init_msg(dev, &mp_res, param->type); + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX4_MP_REQ_STOP_RXTX: + INFO("port %u stopping datapath", dev->data->port_id); + dev->tx_pkt_burst = mlx4_tx_burst_removed; + dev->rx_pkt_burst = mlx4_rx_burst_removed; + rte_mb(); + mp_init_msg(dev, &mp_res, param->type); + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + default: + rte_errno = EINVAL; + ERROR("port %u invalid mp request type", dev->data->port_id); + return -rte_errno; + } + return ret; +} + +/** + * Broadcast request of stopping/starting data-path to secondary processes. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[in] type + * Request type. + */ +static void +mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx4_mp_req_type type) +{ + struct rte_mp_msg mp_req; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mlx4_mp_param *res __rte_unused; + struct timespec ts = {.tv_sec = MLX4_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + int ret; + int i; + + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (!mlx4_shared_data->secondary_cnt) + return; + if (type != MLX4_MP_REQ_START_RXTX && type != MLX4_MP_REQ_STOP_RXTX) { + ERROR("port %u unknown request (req_type %d)", + dev->data->port_id, type); + return; + } + mp_init_msg(dev, &mp_req, type); + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + ERROR("port %u failed to request stop/start Rx/Tx (%d)", + dev->data->port_id, type); + goto exit; + } + if (mp_rep.nb_sent != mp_rep.nb_received) { + ERROR("port %u not all secondaries responded (req_type %d)", + dev->data->port_id, type); + goto exit; + } + for (i = 0; i < mp_rep.nb_received; i++) { + mp_res = &mp_rep.msgs[i]; + res = (struct mlx4_mp_param *)mp_res->param; + if (res->result) { + ERROR("port %u request failed on secondary #%d", + dev->data->port_id, i); + goto exit; + } + } +exit: + free(mp_rep.msgs); +} + +/** + * Broadcast request of starting data-path to secondary processes. The request + * is synchronous. + * + * @param[in] dev + * Pointer to Ethernet structure. + */ +void +mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev) +{ + mp_req_on_rxtx(dev, MLX4_MP_REQ_START_RXTX); +} + +/** + * Broadcast request of stopping data-path to secondary processes. The request + * is synchronous. + * + * @param[in] dev + * Pointer to Ethernet structure. + */ +void +mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev) +{ + mp_req_on_rxtx(dev, MLX4_MP_REQ_STOP_RXTX); +} + +/** + * Request Memory Region creation to the primary process. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param addr + * Target virtual address to register. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mp_req_mr_create(struct rte_eth_dev *dev, uintptr_t addr) +{ + struct rte_mp_msg mp_req; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mlx4_mp_param *req = (struct mlx4_mp_param *)mp_req.param; + struct mlx4_mp_param *res; + struct timespec ts = {.tv_sec = MLX4_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + int ret; + + assert(rte_eal_process_type() == RTE_PROC_SECONDARY); + mp_init_msg(dev, &mp_req, MLX4_MP_REQ_CREATE_MR); + req->args.addr = addr; + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + ERROR("port %u request to primary process failed", + dev->data->port_id); + return -rte_errno; + } + assert(mp_rep.nb_received == 1); + mp_res = &mp_rep.msgs[0]; + res = (struct mlx4_mp_param *)mp_res->param; + ret = res->result; + if (ret) + rte_errno = -ret; + free(mp_rep.msgs); + return ret; +} + +/** + * IPC message handler of primary process. + * + * @param[in] dev + * Pointer to Ethernet structure. + * + * @return + * fd on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev) +{ + struct rte_mp_msg mp_req; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mlx4_mp_param *res; + struct timespec ts = {.tv_sec = MLX4_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + int ret; + + assert(rte_eal_process_type() == RTE_PROC_SECONDARY); + mp_init_msg(dev, &mp_req, MLX4_MP_REQ_VERBS_CMD_FD); + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + ERROR("port %u request to primary process failed", + dev->data->port_id); + return -rte_errno; + } + assert(mp_rep.nb_received == 1); + mp_res = &mp_rep.msgs[0]; + res = (struct mlx4_mp_param *)mp_res->param; + if (res->result) { + rte_errno = -res->result; + ERROR("port %u failed to get command FD from primary process", + dev->data->port_id); + ret = -rte_errno; + goto exit; + } + assert(mp_res->num_fds == 1); + ret = mp_res->fds[0]; + DEBUG("port %u command FD from primary is %d", + dev->data->port_id, ret); +exit: + free(mp_rep.msgs); + return ret; +} + +/** + * Initialize by primary process. + */ +void +mlx4_mp_init_primary(void) +{ + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + rte_mp_action_register(MLX4_MP_NAME, mp_primary_handle); +} + +/** + * Un-initialize by primary process. + */ +void +mlx4_mp_uninit_primary(void) +{ + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + rte_mp_action_unregister(MLX4_MP_NAME); +} + +/** + * Initialize by secondary process. + */ +void +mlx4_mp_init_secondary(void) +{ + assert(rte_eal_process_type() == RTE_PROC_SECONDARY); + rte_mp_action_register(MLX4_MP_NAME, mp_secondary_handle); +} + +/** + * Un-initialize by secondary process. + */ +void +mlx4_mp_uninit_secondary(void) +{ + assert(rte_eal_process_type() == RTE_PROC_SECONDARY); + rte_mp_action_unregister(MLX4_MP_NAME); +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_mr.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_mr.c new file mode 100644 index 000000000..48d458ad4 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_mr.c @@ -0,0 +1,1462 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Memory management functions for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <inttypes.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_branch_prediction.h> +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_malloc.h> +#include <rte_memory.h> +#include <rte_mempool.h> +#include <rte_rwlock.h> + +#include "mlx4_glue.h" +#include "mlx4_mr.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +struct mr_find_contig_memsegs_data { + uintptr_t addr; + uintptr_t start; + uintptr_t end; + const struct rte_memseg_list *msl; +}; + +struct mr_update_mp_data { + struct rte_eth_dev *dev; + struct mlx4_mr_ctrl *mr_ctrl; + int ret; +}; + +/** + * Expand B-tree table to a given size. Can't be called with holding + * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc(). + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries for expansion. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_btree_expand(struct mlx4_mr_btree *bt, int n) +{ + void *mem; + int ret = 0; + + if (n <= bt->size) + return ret; + /* + * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is + * used inside if there's no room to expand. Because this is a quite + * rare case and a part of very slow path, it is very acceptable. + * Initially cache_bh[] will be given practically enough space and once + * it is expanded, expansion wouldn't be needed again ever. + */ + mem = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0); + if (mem == NULL) { + /* Not an error, B-tree search will be skipped. */ + WARN("failed to expand MR B-tree (%p) table", (void *)bt); + ret = -1; + } else { + DEBUG("expanded MR B-tree table (size=%u)", n); + bt->table = mem; + bt->size = n; + } + return ret; +} + +/** + * Look up LKey from given B-tree lookup table, store the last index and return + * searched LKey. + * + * @param bt + * Pointer to B-tree structure. + * @param[out] idx + * Pointer to index. Even on search failure, returns index where it stops + * searching so that index can be used when inserting a new entry. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr) +{ + struct mlx4_mr_cache *lkp_tbl; + uint16_t n; + uint16_t base = 0; + + assert(bt != NULL); + lkp_tbl = *bt->table; + n = bt->len; + /* First entry must be NULL for comparison. */ + assert(bt->len > 0 || (lkp_tbl[0].start == 0 && + lkp_tbl[0].lkey == UINT32_MAX)); + /* Binary search. */ + do { + register uint16_t delta = n >> 1; + + if (addr < lkp_tbl[base + delta].start) { + n = delta; + } else { + base += delta; + n -= delta; + } + } while (n > 1); + assert(addr >= lkp_tbl[base].start); + *idx = base; + if (addr < lkp_tbl[base].end) + return lkp_tbl[base].lkey; + /* Not found. */ + return UINT32_MAX; +} + +/** + * Insert an entry to B-tree lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param entry + * Pointer to new entry to insert. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry) +{ + struct mlx4_mr_cache *lkp_tbl; + uint16_t idx = 0; + size_t shift; + + assert(bt != NULL); + assert(bt->len <= bt->size); + assert(bt->len > 0); + lkp_tbl = *bt->table; + /* Find out the slot for insertion. */ + if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { + DEBUG("abort insertion to B-tree(%p): already exist at" + " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + /* Already exist, return. */ + return 0; + } + /* If table is full, return error. */ + if (unlikely(bt->len == bt->size)) { + bt->overflow = 1; + return -1; + } + /* Insert entry. */ + ++idx; + shift = (bt->len - idx) * sizeof(struct mlx4_mr_cache); + if (shift) + memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); + lkp_tbl[idx] = *entry; + bt->len++; + DEBUG("inserted B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + return 0; +} + +/** + * Initialize B-tree and allocate memory for lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries to allocate. + * @param socket + * NUMA socket on which memory must be allocated. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket) +{ + if (bt == NULL) { + rte_errno = EINVAL; + return -rte_errno; + } + memset(bt, 0, sizeof(*bt)); + bt->table = rte_calloc_socket("B-tree table", + n, sizeof(struct mlx4_mr_cache), + 0, socket); + if (bt->table == NULL) { + rte_errno = ENOMEM; + ERROR("failed to allocate memory for btree cache on socket %d", + socket); + return -rte_errno; + } + bt->size = n; + /* First entry must be NULL for binary search. */ + (*bt->table)[bt->len++] = (struct mlx4_mr_cache) { + .lkey = UINT32_MAX, + }; + DEBUG("initialized B-tree %p with table %p", + (void *)bt, (void *)bt->table); + return 0; +} + +/** + * Free B-tree resources. + * + * @param bt + * Pointer to B-tree structure. + */ +void +mlx4_mr_btree_free(struct mlx4_mr_btree *bt) +{ + if (bt == NULL) + return; + DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table); + rte_free(bt->table); + memset(bt, 0, sizeof(*bt)); +} + +#ifndef NDEBUG +/** + * Dump all the entries in a B-tree + * + * @param bt + * Pointer to B-tree structure. + */ +void +mlx4_mr_btree_dump(struct mlx4_mr_btree *bt) +{ + int idx; + struct mlx4_mr_cache *lkp_tbl; + + if (bt == NULL) + return; + lkp_tbl = *bt->table; + for (idx = 0; idx < bt->len; ++idx) { + struct mlx4_mr_cache *entry = &lkp_tbl[idx]; + + DEBUG("B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + } +} +#endif + +/** + * Find virtually contiguous memory chunk in a given MR. + * + * @param dev + * Pointer to MR structure. + * @param[out] entry + * Pointer to returning MR cache entry. If not found, this will not be + * updated. + * @param start_idx + * Start index of the memseg bitmap. + * + * @return + * Next index to go on lookup. + */ +static int +mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry, + int base_idx) +{ + uintptr_t start = 0; + uintptr_t end = 0; + uint32_t idx = 0; + + /* MR for external memory doesn't have memseg list. */ + if (mr->msl == NULL) { + struct ibv_mr *ibv_mr = mr->ibv_mr; + + assert(mr->ms_bmp_n == 1); + assert(mr->ms_n == 1); + assert(base_idx == 0); + /* + * Can't search it from memseg list but get it directly from + * verbs MR as there's only one chunk. + */ + entry->start = (uintptr_t)ibv_mr->addr; + entry->end = (uintptr_t)ibv_mr->addr + mr->ibv_mr->length; + entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); + /* Returning 1 ends iteration. */ + return 1; + } + for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { + if (rte_bitmap_get(mr->ms_bmp, idx)) { + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; + + msl = mr->msl; + ms = rte_fbarray_get(&msl->memseg_arr, + mr->ms_base_idx + idx); + assert(msl->page_sz == ms->hugepage_sz); + if (!start) + start = ms->addr_64; + end = ms->addr_64 + ms->hugepage_sz; + } else if (start) { + /* Passed the end of a fragment. */ + break; + } + } + if (start) { + /* Found one chunk. */ + entry->start = start; + entry->end = end; + entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); + } + return idx; +} + +/** + * Insert a MR to the global B-tree cache. It may fail due to low-on-memory. + * Then, this entry will have to be searched by mr_lookup_dev_list() in + * mlx4_mr_create() on miss. + * + * @param dev + * Pointer to Ethernet device. + * @param mr + * Pointer to MR to insert. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr) +{ + struct mlx4_priv *priv = dev->data->dev_private; + unsigned int n; + + DEBUG("port %u inserting MR(%p) to global cache", + dev->data->port_id, (void *)mr); + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx4_mr_cache entry; + + memset(&entry, 0, sizeof(entry)); + /* Find a contiguous chunk and advance the index. */ + n = mr_find_next_chunk(mr, &entry, n); + if (!entry.end) + break; + if (mr_btree_insert(&priv->mr.cache, &entry) < 0) { + /* + * Overflowed, but the global table cannot be expanded + * because of deadlock. + */ + return -1; + } + } + return 0; +} + +/** + * Look up address in the original global MR list. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. + * + * @return + * Found MR on match, NULL otherwise. + */ +static struct mlx4_mr * +mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4_mr *mr; + + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; + + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx4_mr_cache ret; + + memset(&ret, 0, sizeof(ret)); + n = mr_find_next_chunk(mr, &ret, n); + if (addr >= ret.start && addr < ret.end) { + /* Found. */ + *entry = ret; + return mr; + } + } + } + return NULL; +} + +/** + * Look up address on device. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. + */ +static uint32_t +mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + struct mlx4_priv *priv = dev->data->dev_private; + uint16_t idx; + uint32_t lkey = UINT32_MAX; + struct mlx4_mr *mr; + + /* + * If the global cache has overflowed since it failed to expand the + * B-tree table, it can't have all the existing MRs. Then, the address + * has to be searched by traversing the original MR list instead, which + * is very slow path. Otherwise, the global cache is all inclusive. + */ + if (!unlikely(priv->mr.cache.overflow)) { + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) + *entry = (*priv->mr.cache.table)[idx]; + } else { + /* Falling back to the slowest path. */ + mr = mr_lookup_dev_list(dev, entry, addr); + if (mr != NULL) + lkey = entry->lkey; + } + assert(lkey == UINT32_MAX || (addr >= entry->start && + addr < entry->end)); + return lkey; +} + +/** + * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() + * can raise memory free event and the callback function will spin on the lock. + * + * @param mr + * Pointer to MR to free. + */ +static void +mr_free(struct mlx4_mr *mr) +{ + if (mr == NULL) + return; + DEBUG("freeing MR(%p):", (void *)mr); + if (mr->ibv_mr != NULL) + claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr)); + if (mr->ms_bmp != NULL) + rte_bitmap_free(mr->ms_bmp); + rte_free(mr); +} + +/** + * Release resources of detached MR having no online entry. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx4_mr_garbage_collect(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4_mr *mr_next; + struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); + + /* Must be called from the primary process. */ + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* + * MR can't be freed with holding the lock because rte_free() could call + * memory free callback function. This will be a deadlock situation. + */ + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach the whole free list and release it after unlocking. */ + free_list = priv->mr.mr_free_list; + LIST_INIT(&priv->mr.mr_free_list); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Release resources. */ + mr_next = LIST_FIRST(&free_list); + while (mr_next != NULL) { + struct mlx4_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + mr_free(mr); + } +} + +/* Called during rte_memseg_contig_walk() by mlx4_mr_create(). */ +static int +mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct mr_find_contig_memsegs_data *data = arg; + + if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) + return 0; + /* Found, save it and stop walking. */ + data->start = ms->addr_64; + data->end = ms->addr_64 + len; + data->msl = msl; + return 1; +} + +/** + * Create a new global Memory Region (MR) for a missing virtual address. + * This API should be called on a secondary process, then a request is sent to + * the primary process in order to create a MR for the address. As the global MR + * list is on the shared memory, following LKey lookup should succeed unless the + * request fails. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this will not be updated. + * @param addr + * Target virtual address to register. + * + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. + */ +static uint32_t +mlx4_mr_create_secondary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + struct mlx4_priv *priv = dev->data->dev_private; + int ret; + + DEBUG("port %u requesting MR creation for address (%p)", + dev->data->port_id, (void *)addr); + ret = mlx4_mp_req_mr_create(dev, addr); + if (ret) { + DEBUG("port %u fail to request MR creation for address (%p)", + dev->data->port_id, (void *)addr); + return UINT32_MAX; + } + rte_rwlock_read_lock(&priv->mr.rwlock); + /* Fill in output data. */ + mr_lookup_dev(dev, entry, addr); + /* Lookup can't fail. */ + assert(entry->lkey != UINT32_MAX); + rte_rwlock_read_unlock(&priv->mr.rwlock); + DEBUG("port %u MR CREATED by primary process for %p:\n" + " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", + dev->data->port_id, (void *)addr, + entry->start, entry->end, entry->lkey); + return entry->lkey; +} + +/** + * Create a new global Memory Region (MR) for a missing virtual address. + * Register entire virtually contiguous memory chunk around the address. + * This must be called from the primary process. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this will not be updated. + * @param addr + * Target virtual address to register. + * + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. + */ +uint32_t +mlx4_mr_create_primary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; + struct mlx4_mr *mr = NULL; + size_t len; + uint32_t ms_n; + uint32_t bmp_size; + void *bmp_mem; + int ms_idx_shift = -1; + unsigned int n; + struct mr_find_contig_memsegs_data data = { + .addr = addr, + }; + struct mr_find_contig_memsegs_data data_re; + + DEBUG("port %u creating a MR using address (%p)", + dev->data->port_id, (void *)addr); + /* + * Release detached MRs if any. This can't be called with holding either + * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have + * been detached by the memory free event but it couldn't be released + * inside the callback due to deadlock. As a result, releasing resources + * is quite opportunistic. + */ + mlx4_mr_garbage_collect(dev); + /* + * If enabled, find out a contiguous virtual address chunk in use, to + * which the given address belongs, in order to register maximum range. + * In the best case where mempools are not dynamically recreated and + * '--socket-mem' is specified as an EAL option, it is very likely to + * have only one MR(LKey) per a socket and per a hugepage-size even + * though the system memory is highly fragmented. As the whole memory + * chunk will be pinned by kernel, it can't be reused unless entire + * chunk is freed from EAL. + * + * If disabled, just register one memseg (page). Then, memory + * consumption will be minimized but it may drop performance if there + * are many MRs to lookup on the datapath. + */ + if (!priv->mr_ext_memseg_en) { + data.msl = rte_mem_virt2memseg_list((void *)addr); + data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz); + data.end = data.start + data.msl->page_sz; + } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { + WARN("port %u unable to find virtually contiguous" + " chunk for address (%p)." + " rte_memseg_contig_walk() failed.", + dev->data->port_id, (void *)addr); + rte_errno = ENXIO; + goto err_nolock; + } +alloc_resources: + /* Addresses must be page-aligned. */ + assert(rte_is_aligned((void *)data.start, data.msl->page_sz)); + assert(rte_is_aligned((void *)data.end, data.msl->page_sz)); + msl = data.msl; + ms = rte_mem_virt2memseg((void *)data.start, msl); + len = data.end - data.start; + assert(msl->page_sz == ms->hugepage_sz); + /* Number of memsegs in the range. */ + ms_n = len / msl->page_sz; + DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " page_sz=0x%" PRIx64 ", ms_n=%u", + dev->data->port_id, (void *)addr, + data.start, data.end, msl->page_sz, ms_n); + /* Size of memory for bitmap. */ + bmp_size = rte_bitmap_get_memory_footprint(ms_n); + mr = rte_zmalloc_socket(NULL, + RTE_ALIGN_CEIL(sizeof(*mr), + RTE_CACHE_LINE_SIZE) + + bmp_size, + RTE_CACHE_LINE_SIZE, msl->socket_id); + if (mr == NULL) { + WARN("port %u unable to allocate memory for a new MR of" + " address (%p).", + dev->data->port_id, (void *)addr); + rte_errno = ENOMEM; + goto err_nolock; + } + mr->msl = msl; + /* + * Save the index of the first memseg and initialize memseg bitmap. To + * see if a memseg of ms_idx in the memseg-list is still valid, check: + * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) + */ + mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); + mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); + if (mr->ms_bmp == NULL) { + WARN("port %u unable to initialize bitmap for a new MR of" + " address (%p).", + dev->data->port_id, (void *)addr); + rte_errno = EINVAL; + goto err_nolock; + } + /* + * Should recheck whether the extended contiguous chunk is still valid. + * Because memory_hotplug_lock can't be held if there's any memory + * related calls in a critical path, resource allocation above can't be + * locked. If the memory has been changed at this point, try again with + * just single page. If not, go on with the big chunk atomically from + * here. + */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + data_re = data; + if (len > msl->page_sz && + !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { + WARN("port %u unable to find virtually contiguous" + " chunk for address (%p)." + " rte_memseg_contig_walk() failed.", + dev->data->port_id, (void *)addr); + rte_errno = ENXIO; + goto err_memlock; + } + if (data.start != data_re.start || data.end != data_re.end) { + /* + * The extended contiguous chunk has been changed. Try again + * with single memseg instead. + */ + data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); + data.end = data.start + msl->page_sz; + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + mr_free(mr); + goto alloc_resources; + } + assert(data.msl == data_re.msl); + rte_rwlock_write_lock(&priv->mr.rwlock); + /* + * Check the address is really missing. If other thread already created + * one or it is not found due to overflow, abort and return. + */ + if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) { + /* + * Insert to the global cache table. It may fail due to + * low-on-memory. Then, this entry will have to be searched + * here again. + */ + mr_btree_insert(&priv->mr.cache, entry); + DEBUG("port %u found MR for %p on final lookup, abort", + dev->data->port_id, (void *)addr); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + /* + * Must be unlocked before calling rte_free() because + * mlx4_mr_mem_event_free_cb() can be called inside. + */ + mr_free(mr); + return entry->lkey; + } + /* + * Trim start and end addresses for verbs MR. Set bits for registering + * memsegs but exclude already registered ones. Bitmap can be + * fragmented. + */ + for (n = 0; n < ms_n; ++n) { + uintptr_t start; + struct mlx4_mr_cache ret; + + memset(&ret, 0, sizeof(ret)); + start = data_re.start + n * msl->page_sz; + /* Exclude memsegs already registered by other MRs. */ + if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) { + /* + * Start from the first unregistered memseg in the + * extended range. + */ + if (ms_idx_shift == -1) { + mr->ms_base_idx += n; + data.start = start; + ms_idx_shift = n; + } + data.end = start + msl->page_sz; + rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); + ++mr->ms_n; + } + } + len = data.end - data.start; + mr->ms_bmp_n = len / msl->page_sz; + assert(ms_idx_shift + mr->ms_bmp_n <= ms_n); + /* + * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be + * called with holding the memory lock because it doesn't use + * mlx4_alloc_buf_extern() which eventually calls rte_malloc_socket() + * through mlx4_alloc_verbs_buf(). + */ + mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len, + IBV_ACCESS_LOCAL_WRITE); + if (mr->ibv_mr == NULL) { + WARN("port %u fail to create a verbs MR for address (%p)", + dev->data->port_id, (void *)addr); + rte_errno = EINVAL; + goto err_mrlock; + } + assert((uintptr_t)mr->ibv_mr->addr == data.start); + assert(mr->ibv_mr->length == len); + LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + DEBUG("port %u MR CREATED (%p) for %p:\n" + " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", + dev->data->port_id, (void *)mr, (void *)addr, + data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); + /* Insert to the global cache table. */ + mr_insert_dev_cache(dev, mr); + /* Fill in output data. */ + mr_lookup_dev(dev, entry, addr); + /* Lookup can't fail. */ + assert(entry->lkey != UINT32_MAX); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return entry->lkey; +err_mrlock: + rte_rwlock_write_unlock(&priv->mr.rwlock); +err_memlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +err_nolock: + /* + * In case of error, as this can be called in a datapath, a warning + * message per an error is preferable instead. Must be unlocked before + * calling rte_free() because mlx4_mr_mem_event_free_cb() can be called + * inside. + */ + mr_free(mr); + return UINT32_MAX; +} + +/** + * Create a new global Memory Region (MR) for a missing virtual address. + * This can be called from primary and secondary process. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this will not be updated. + * @param addr + * Target virtual address to register. + * + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. + */ +static uint32_t +mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + uint32_t ret = 0; + + switch (rte_eal_process_type()) { + case RTE_PROC_PRIMARY: + ret = mlx4_mr_create_primary(dev, entry, addr); + break; + case RTE_PROC_SECONDARY: + ret = mlx4_mr_create_secondary(dev, entry, addr); + break; + default: + break; + } + return ret; +} + +/** + * Rebuild the global B-tree cache of device from the original MR list. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mr_rebuild_dev_cache(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4_mr *mr; + + DEBUG("port %u rebuild dev cache[]", dev->data->port_id); + /* Flush cache to rebuild. */ + priv->mr.cache.len = 1; + priv->mr.cache.overflow = 0; + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) + if (mr_insert_dev_cache(dev, mr) < 0) + return; +} + +/** + * Callback for memory free event. Iterate freed memsegs and check whether it + * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a + * result, the MR would be fragmented. If it becomes empty, the MR will be freed + * later by mlx4_mr_garbage_collect(). + * + * The global cache must be rebuilt if there's any change and this event has to + * be propagated to dataplane threads to flush the local caches. + * + * @param dev + * Pointer to Ethernet device. + * @param addr + * Address of freed memory. + * @param len + * Size of freed memory. + */ +static void +mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) +{ + struct mlx4_priv *priv = dev->data->dev_private; + const struct rte_memseg_list *msl; + struct mlx4_mr *mr; + int ms_n; + int i; + int rebuild = 0; + + DEBUG("port %u free callback: addr=%p, len=%zu", + dev->data->port_id, addr, len); + msl = rte_mem_virt2memseg_list(addr); + /* addr and len must be page-aligned. */ + assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz)); + assert(len == RTE_ALIGN(len, msl->page_sz)); + ms_n = len / msl->page_sz; + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Clear bits of freed memsegs from MR. */ + for (i = 0; i < ms_n; ++i) { + const struct rte_memseg *ms; + struct mlx4_mr_cache entry; + uintptr_t start; + int ms_idx; + uint32_t pos; + + /* Find MR having this memseg. */ + start = (uintptr_t)addr + i * msl->page_sz; + mr = mr_lookup_dev_list(dev, &entry, start); + if (mr == NULL) + continue; + assert(mr->msl); /* Can't be external memory. */ + ms = rte_mem_virt2memseg((void *)start, msl); + assert(ms != NULL); + assert(msl->page_sz == ms->hugepage_sz); + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + pos = ms_idx - mr->ms_base_idx; + assert(rte_bitmap_get(mr->ms_bmp, pos)); + assert(pos < mr->ms_bmp_n); + DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p", + dev->data->port_id, (void *)mr, pos, (void *)start); + rte_bitmap_clear(mr->ms_bmp, pos); + if (--mr->ms_n == 0) { + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + DEBUG("port %u remove MR(%p) from list", + dev->data->port_id, (void *)mr); + } + /* + * MR is fragmented or will be freed. the global cache must be + * rebuilt. + */ + rebuild = 1; + } + if (rebuild) { + mr_rebuild_dev_cache(dev); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. + */ + ++priv->mr.dev_gen; + DEBUG("broadcasting local cache flush, gen=%d", + priv->mr.dev_gen); + rte_smp_wmb(); + } + rte_rwlock_write_unlock(&priv->mr.rwlock); +#ifndef NDEBUG + if (rebuild) + mlx4_mr_dump_dev(dev); +#endif +} + +/** + * Callback for memory event. + * + * @param event_type + * Memory event type. + * @param addr + * Address of memory. + * @param len + * Size of memory. + */ +void +mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg __rte_unused) +{ + struct mlx4_priv *priv; + struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list; + + /* Must be called from the primary process. */ + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + switch (event_type) { + case RTE_MEM_EVENT_FREE: + rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock); + /* Iterate all the existing mlx4 devices. */ + LIST_FOREACH(priv, dev_list, mem_event_cb) + mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len); + rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock); + break; + case RTE_MEM_EVENT_ALLOC: + default: + break; + } +} + +/** + * Look up address in the global MR cache table. If not found, create a new MR. + * Insert the found/created entry to local bottom-half cache table. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this is not written. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + struct mlx4_mr_cache *entry, uintptr_t addr) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4_mr_btree *bt = &mr_ctrl->cache_bh; + uint16_t idx; + uint32_t lkey; + + /* If local cache table is full, try to double it. */ + if (unlikely(bt->len == bt->size)) + mr_btree_expand(bt, bt->size << 1); + /* Look up in the global cache. */ + rte_rwlock_read_lock(&priv->mr.rwlock); + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) { + /* Found. */ + *entry = (*priv->mr.cache.table)[idx]; + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* + * Update local cache. Even if it fails, return the found entry + * to update top-half cache. Next time, this entry will be found + * in the global cache. + */ + mr_btree_insert(bt, entry); + return lkey; + } + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* First time to see the address? Create a new MR. */ + lkey = mlx4_mr_create(dev, entry, addr); + /* + * Update the local cache if successfully created a new global MR. Even + * if failed to create one, there's no action to take in this datapath + * code. As returning LKey is invalid, this will eventually make HW + * fail. + */ + if (lkey != UINT32_MAX) + mr_btree_insert(bt, entry); + return lkey; +} + +/** + * Bottom-half of LKey search on datapath. Firstly search in cache_bh[] and if + * misses, search in the global MR cache table and update the new entry to + * per-queue local caches. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + uintptr_t addr) +{ + uint32_t lkey; + uint16_t bh_idx = 0; + /* Victim in top-half cache to replace with new entry. */ + struct mlx4_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head]; + + /* Binary-search MR translation table. */ + lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); + /* Update top-half cache. */ + if (likely(lkey != UINT32_MAX)) { + *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; + } else { + /* + * If missed in local lookup table, search in the global cache + * and local cache_bh[] will be updated inside if possible. + * Top-half cache entry will also be updated. + */ + lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, repl, addr); + if (unlikely(lkey == UINT32_MAX)) + return UINT32_MAX; + } + /* Update the most recently used entry. */ + mr_ctrl->mru = mr_ctrl->head; + /* Point to the next victim, the oldest. */ + mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N; + return lkey; +} + +/** + * Bottom-half of LKey search on Rx. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr) +{ + struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + struct mlx4_priv *priv = rxq->priv; + + return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); +} + +/** + * Bottom-half of LKey search on Tx. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr) +{ + struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct mlx4_priv *priv = txq->priv; + + return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); +} + +/** + * Bottom-half of LKey search on Tx. If it can't be searched in the memseg + * list, register the mempool of the mbuf as externally allocated memory. + * + * @param txq + * Pointer to Tx queue structure. + * @param mb + * Pointer to mbuf. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb) +{ + uintptr_t addr = (uintptr_t)mb->buf_addr; + uint32_t lkey; + + lkey = mlx4_tx_addr2mr_bh(txq, addr); + if (lkey == UINT32_MAX && rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. */ + return mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb)); + } + return lkey; +} + +/** + * Flush all of the local cache entries. + * + * @param mr_ctrl + * Pointer to per-queue MR control structure. + */ +void +mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl) +{ + /* Reset the most-recently-used index. */ + mr_ctrl->mru = 0; + /* Reset the linear search array. */ + mr_ctrl->head = 0; + memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); + /* Reset the B-tree table. */ + mr_ctrl->cache_bh.len = 1; + mr_ctrl->cache_bh.overflow = 0; + /* Update the generation number. */ + mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr; + DEBUG("mr_ctrl(%p): flushed, cur_gen=%d", + (void *)mr_ctrl, mr_ctrl->cur_gen); +} + +/** + * Called during rte_mempool_mem_iter() by mlx4_mr_update_ext_mp(). + * + * Externally allocated chunk is registered and a MR is created for the chunk. + * The MR object is added to the global list. If memseg list of a MR object + * (mr->msl) is null, the MR object can be regarded as externally allocated + * memory. + * + * Once external memory is registered, it should be static. If the memory is + * freed and the virtual address range has different physical memory mapped + * again, it may cause crash on device due to the wrong translation entry. PMD + * can't track the free event of the external memory for now. + */ +static void +mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + struct rte_eth_dev *dev = data->dev; + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4_mr_ctrl *mr_ctrl = data->mr_ctrl; + struct mlx4_mr *mr = NULL; + uintptr_t addr = (uintptr_t)memhdr->addr; + size_t len = memhdr->len; + struct mlx4_mr_cache entry; + uint32_t lkey; + + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* If already registered, it should return. */ + rte_rwlock_read_lock(&priv->mr.rwlock); + lkey = mr_lookup_dev(dev, &entry, addr); + rte_rwlock_read_unlock(&priv->mr.rwlock); + if (lkey != UINT32_MAX) + return; + mr = rte_zmalloc_socket(NULL, + RTE_ALIGN_CEIL(sizeof(*mr), + RTE_CACHE_LINE_SIZE), + RTE_CACHE_LINE_SIZE, mp->socket_id); + if (mr == NULL) { + WARN("port %u unable to allocate memory for a new MR of" + " mempool (%s).", + dev->data->port_id, mp->name); + data->ret = -1; + return; + } + DEBUG("port %u register MR for chunk #%d of mempool (%s)", + dev->data->port_id, mem_idx, mp->name); + mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)addr, len, + IBV_ACCESS_LOCAL_WRITE); + if (mr->ibv_mr == NULL) { + WARN("port %u fail to create a verbs MR for address (%p)", + dev->data->port_id, (void *)addr); + rte_free(mr); + data->ret = -1; + return; + } + mr->msl = NULL; /* Mark it is external memory. */ + mr->ms_bmp = NULL; + mr->ms_n = 1; + mr->ms_bmp_n = 1; + rte_rwlock_write_lock(&priv->mr.rwlock); + LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + DEBUG("port %u MR CREATED (%p) for external memory %p:\n" + " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", + dev->data->port_id, (void *)mr, (void *)addr, + addr, addr + len, rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); + /* Insert to the global cache table. */ + mr_insert_dev_cache(dev, mr); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Insert to the local cache table */ + mlx4_mr_addr2mr_bh(dev, mr_ctrl, addr); +} + +/** + * Register MR for entire memory chunks in a Mempool having externally allocated + * memory and fill in local cache. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +static uint32_t +mlx4_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx4_mr_update_ext_mp_cb, &data); + return data.ret; +} + +/** + * Register MR entire memory chunks in a Mempool having externally allocated + * memory and search LKey of the address to return. + * + * @param dev + * Pointer to Ethernet device. + * @param addr + * Search key. + * @param mp + * Pointer to registering Mempool where addr belongs. + * + * @return + * LKey for address on success, UINT32_MAX on failure. + */ +uint32_t +mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp) +{ + struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct mlx4_priv *priv = txq->priv; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + WARN("port %u using address (%p) from unregistered mempool" + " having externally allocated memory" + " in secondary process, please create mempool" + " prior to rte_eth_dev_start()", + PORT_ID(priv), (void *)addr); + return UINT32_MAX; + } + mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp); + return mlx4_tx_addr2mr_bh(txq, addr); +} + +/* Called during rte_mempool_mem_iter() by mlx4_mr_update_mp(). */ +static void +mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + uint32_t lkey; + + /* Stop iteration if failed in the previous walk. */ + if (data->ret < 0) + return; + /* Register address of the chunk and update local caches. */ + lkey = mlx4_mr_addr2mr_bh(data->dev, data->mr_ctrl, + (uintptr_t)memhdr->addr); + if (lkey == UINT32_MAX) + data->ret = -1; +} + +/** + * Register entire memory chunks in a Mempool. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +int +mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &data); + if (data.ret < 0 && rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. */ + return mlx4_mr_update_ext_mp(dev, mr_ctrl, mp); + } + return data.ret; +} + +#ifndef NDEBUG +/** + * Dump all the created MRs and the global cache entries. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx4_mr_dump_dev(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4_mr *mr; + int mr_n = 0; + int chunk_n = 0; + + rte_rwlock_read_lock(&priv->mr.rwlock); + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; + + DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", + dev->data->port_id, mr_n++, + rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_n, mr->ms_bmp_n); + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx4_mr_cache ret; + + memset(&ret, 0, sizeof(ret)); + n = mr_find_next_chunk(mr, &ret, n); + if (!ret.end) + break; + DEBUG(" chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", + chunk_n++, ret.start, ret.end); + } + } + DEBUG("port %u dumping global cache", dev->data->port_id); + mlx4_mr_btree_dump(&priv->mr.cache); + rte_rwlock_read_unlock(&priv->mr.rwlock); +} +#endif + +/** + * Release all the created MRs and resources. Remove device from memory callback + * list. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx4_mr_release(struct rte_eth_dev *dev) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4_mr *mr_next; + + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock); + LIST_REMOVE(priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock); +#ifndef NDEBUG + mlx4_mr_dump_dev(dev); +#endif + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach from MR list and move to free list. */ + mr_next = LIST_FIRST(&priv->mr.mr_list); + while (mr_next != NULL) { + struct mlx4_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + } + LIST_INIT(&priv->mr.mr_list); + /* Free global cache. */ + mlx4_mr_btree_free(&priv->mr.cache); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Free all remaining MRs. */ + mlx4_mr_garbage_collect(dev); +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_mr.h b/src/seastar/dpdk/drivers/net/mlx4/mlx4_mr.h new file mode 100644 index 000000000..9d125e239 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_mr.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX4_MR_H_ +#define RTE_PMD_MLX4_MR_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_eal_memconfig.h> +#include <rte_ethdev.h> +#include <rte_rwlock.h> +#include <rte_bitmap.h> + +/* Size of per-queue MR cache array for linear search. */ +#define MLX4_MR_CACHE_N 8 + +/* Size of MR cache table for binary search. */ +#define MLX4_MR_BTREE_CACHE_N 256 + +/* Memory Region object. */ +struct mlx4_mr { + LIST_ENTRY(mlx4_mr) mr; /**< Pointer to the prev/next entry. */ + struct ibv_mr *ibv_mr; /* Verbs Memory Region. */ + const struct rte_memseg_list *msl; + int ms_base_idx; /* Start index of msl->memseg_arr[]. */ + int ms_n; /* Number of memsegs in use. */ + uint32_t ms_bmp_n; /* Number of bits in memsegs bit-mask. */ + struct rte_bitmap *ms_bmp; /* Bit-mask of memsegs belonged to MR. */ +}; + +/* Cache entry for Memory Region. */ +struct mlx4_mr_cache { + uintptr_t start; /* Start address of MR. */ + uintptr_t end; /* End address of MR. */ + uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */ +} __rte_packed; + +/* MR Cache table for Binary search. */ +struct mlx4_mr_btree { + uint16_t len; /* Number of entries. */ + uint16_t size; /* Total number of entries. */ + int overflow; /* Mark failure of table expansion. */ + struct mlx4_mr_cache (*table)[]; +} __rte_packed; + +/* Per-queue MR control descriptor. */ +struct mlx4_mr_ctrl { + uint32_t *dev_gen_ptr; /* Generation number of device to poll. */ + uint32_t cur_gen; /* Generation number saved to flush caches. */ + uint16_t mru; /* Index of last hit entry in top-half cache. */ + uint16_t head; /* Index of the oldest entry in top-half cache. */ + struct mlx4_mr_cache cache[MLX4_MR_CACHE_N]; /* Cache for top-half. */ + struct mlx4_mr_btree cache_bh; /* Cache for bottom-half. */ +} __rte_packed; + +extern struct mlx4_dev_list mlx4_mem_event_cb_list; +extern rte_rwlock_t mlx4_mem_event_rwlock; + +/* First entry must be NULL for comparison. */ +#define mlx4_mr_btree_len(bt) ((bt)->len - 1) + +int mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket); +void mlx4_mr_btree_free(struct mlx4_mr_btree *bt); +void mlx4_mr_btree_dump(struct mlx4_mr_btree *bt); +uint32_t mlx4_mr_create_primary(struct rte_eth_dev *dev, + struct mlx4_mr_cache *entry, uintptr_t addr); +void mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg); +int mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + struct rte_mempool *mp); +void mlx4_mr_dump_dev(struct rte_eth_dev *dev); +void mlx4_mr_release(struct rte_eth_dev *dev); + +/** + * Look up LKey from given lookup table by linear search. Firstly look up the + * last-hit entry. If miss, the entire array is searched. If found, update the + * last-hit index and return LKey. + * + * @param lkp_tbl + * Pointer to lookup table. + * @param[in,out] cached_idx + * Pointer to last-hit index. + * @param n + * Size of lookup table. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx4_mr_lookup_cache(struct mlx4_mr_cache *lkp_tbl, uint16_t *cached_idx, + uint16_t n, uintptr_t addr) +{ + uint16_t idx; + + if (likely(addr >= lkp_tbl[*cached_idx].start && + addr < lkp_tbl[*cached_idx].end)) + return lkp_tbl[*cached_idx].lkey; + for (idx = 0; idx < n && lkp_tbl[idx].start != 0; ++idx) { + if (addr >= lkp_tbl[idx].start && + addr < lkp_tbl[idx].end) { + /* Found. */ + *cached_idx = idx; + return lkp_tbl[idx].lkey; + } + } + return UINT32_MAX; +} + +#endif /* RTE_PMD_MLX4_MR_H_ */ diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_prm.h b/src/seastar/dpdk/drivers/net/mlx4/mlx4_prm.h new file mode 100644 index 000000000..16ae6db82 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_prm.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef MLX4_PRM_H_ +#define MLX4_PRM_H_ + +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_byteorder.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif +#include "mlx4_autoconf.h" + +/* ConnectX-3 Tx queue basic block. */ +#define MLX4_TXBB_SHIFT 6 +#define MLX4_TXBB_SIZE (1 << MLX4_TXBB_SHIFT) + +/* Typical TSO descriptor with 16 gather entries is 352 bytes. */ +#define MLX4_MAX_SGE 32 +#define MLX4_MAX_WQE_SIZE \ + (MLX4_MAX_SGE * sizeof(struct mlx4_wqe_data_seg) + \ + sizeof(struct mlx4_wqe_ctrl_seg)) +#define MLX4_SEG_SHIFT 4 + +/* Send queue stamping/invalidating information. */ +#define MLX4_SQ_STAMP_STRIDE 64 +#define MLX4_SQ_STAMP_DWORDS (MLX4_SQ_STAMP_STRIDE / 4) +#define MLX4_SQ_OWNER_BIT 31 +#define MLX4_SQ_STAMP_VAL 0x7fffffff + +/* Work queue element (WQE) flags. */ +#define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28) +#define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27) +#define MLX4_WQE_CTRL_RR (1 << 6) + +/* CQE checksum flags. */ +enum { + MLX4_CQE_L2_TUNNEL_IPV4 = (int)(1u << 25), + MLX4_CQE_L2_TUNNEL_L4_CSUM = (int)(1u << 26), + MLX4_CQE_L2_TUNNEL = (int)(1u << 27), + MLX4_CQE_L2_VLAN_MASK = (int)(3u << 29), + MLX4_CQE_L2_TUNNEL_IPOK = (int)(1u << 31), +}; + +/* CQE status flags. */ +#define MLX4_CQE_STATUS_IPV6F (1 << 12) +#define MLX4_CQE_STATUS_IPV4 (1 << 22) +#define MLX4_CQE_STATUS_IPV4F (1 << 23) +#define MLX4_CQE_STATUS_IPV6 (1 << 24) +#define MLX4_CQE_STATUS_IPV4OPT (1 << 25) +#define MLX4_CQE_STATUS_TCP (1 << 26) +#define MLX4_CQE_STATUS_UDP (1 << 27) +#define MLX4_CQE_STATUS_PTYPE_MASK \ + (MLX4_CQE_STATUS_IPV4 | \ + MLX4_CQE_STATUS_IPV4F | \ + MLX4_CQE_STATUS_IPV6 | \ + MLX4_CQE_STATUS_IPV4OPT | \ + MLX4_CQE_STATUS_TCP | \ + MLX4_CQE_STATUS_UDP) + +/* Send queue information. */ +struct mlx4_sq { + volatile uint8_t *buf; /**< SQ buffer. */ + volatile uint8_t *eob; /**< End of SQ buffer */ + uint32_t size; /**< SQ size includes headroom. */ + uint32_t remain_size; /**< Remaining WQE room in SQ (bytes). */ + uint32_t owner_opcode; + /**< Default owner opcode with HW valid owner bit. */ + uint32_t stamp; /**< Stamp value with an invalid HW owner bit. */ + uint32_t *db; /**< Pointer to the doorbell. */ + off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */ + uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */ +}; + +/* Completion queue events, numbers and masks. */ +#define MLX4_CQ_DB_GEQ_N_MASK 0x3 +#define MLX4_CQ_DOORBELL 0x20 +#define MLX4_CQ_DB_CI_MASK 0xffffff + +/* Completion queue information. */ +struct mlx4_cq { + volatile void *cq_uar; /**< CQ user access region. */ + volatile void *cq_db_reg; /**< CQ doorbell register. */ + volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */ + volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */ + volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */ + uint32_t cqe_cnt; /**< Number of entries in the queue. */ + uint32_t cqe_64:1; /**< CQ entry size is 64 bytes. */ + uint32_t cons_index; /**< Last queue entry that was handled. */ + uint32_t cqn; /**< CQ number. */ + int arm_sn; /**< Rx event counter. */ +}; + +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG +/* + * WQE LSO segment structure. + * Defined here as backward compatibility for rdma-core v17 and below. + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18 + * and above. + */ +struct mlx4_wqe_lso_seg { + rte_be32_t mss_hdr_size; + rte_be32_t header[]; +}; +#endif + +/** + * Retrieve a CQE entry from a CQ. + * + * cqe = cq->buf + cons_index * cqe_size + cqe_offset + * + * Where cqe_size is 32 or 64 bytes and cqe_offset is 0 or 32 (depending on + * cqe_size). + * + * @param cq + * CQ to retrieve entry from. + * @param index + * Entry index. + * + * @return + * Pointer to CQE entry. + */ +static inline volatile struct mlx4_cqe * +mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index) +{ + return (volatile struct mlx4_cqe *)(cq->buf + + ((index & (cq->cqe_cnt - 1)) << + (5 + cq->cqe_64)) + + (cq->cqe_64 << 5)); +} + +/** + * Transpose a flag in a value. + * + * @param val + * Input value. + * @param from + * Flag to retrieve from input value. + * @param to + * Flag to set in output value. + * + * @return + * Output value with transposed flag enabled if present on input. + */ +static inline uint64_t +mlx4_transpose(uint64_t val, uint64_t from, uint64_t to) +{ + return (from >= to ? + (val & from) / (from / to) : + (val & from) * (to / from)); +} + +#endif /* MLX4_PRM_H_ */ diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxq.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxq.c new file mode 100644 index 000000000..f45c1ff85 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxq.c @@ -0,0 +1,941 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Rx queues configuration for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_byteorder.h> +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> + +#include "mlx4.h" +#include "mlx4_glue.h" +#include "mlx4_flow.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Historical RSS hash key. + * + * This used to be the default for mlx4 in Linux before v3.19 switched to + * generating random hash keys through netdev_rss_key_fill(). + * + * It is used in this PMD for consistency with past DPDK releases but can + * now be overridden through user configuration. + * + * Note: this is not const to work around API quirks. + */ +uint8_t +mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = { + 0x2c, 0xc6, 0x81, 0xd1, + 0x5b, 0xdb, 0xf4, 0xf7, + 0xfc, 0xa2, 0x83, 0x19, + 0xdb, 0x1a, 0x3e, 0x94, + 0x6b, 0x9e, 0x38, 0xd9, + 0x2c, 0x9c, 0x03, 0xd1, + 0xad, 0x99, 0x44, 0xa7, + 0xd9, 0x56, 0x3d, 0x59, + 0x06, 0x3c, 0x25, 0xf3, + 0xfc, 0x1f, 0xdc, 0x2a, +}; + +/** + * Obtain a RSS context with specified properties. + * + * Used when creating a flow rule targeting one or several Rx queues. + * + * If a matching RSS context already exists, it is returned with its + * reference count incremented. + * + * @param priv + * Pointer to private structure. + * @param fields + * Fields for RSS processing (Verbs format). + * @param[in] key + * Hash key to use (whose size is exactly MLX4_RSS_HASH_KEY_SIZE). + * @param queues + * Number of target queues. + * @param[in] queue_id + * Target queues. + * + * @return + * Pointer to RSS context on success, NULL otherwise and rte_errno is set. + */ +struct mlx4_rss * +mlx4_rss_get(struct mlx4_priv *priv, uint64_t fields, + const uint8_t key[MLX4_RSS_HASH_KEY_SIZE], + uint16_t queues, const uint16_t queue_id[]) +{ + struct mlx4_rss *rss; + size_t queue_id_size = sizeof(queue_id[0]) * queues; + + LIST_FOREACH(rss, &priv->rss, next) + if (fields == rss->fields && + queues == rss->queues && + !memcmp(key, rss->key, MLX4_RSS_HASH_KEY_SIZE) && + !memcmp(queue_id, rss->queue_id, queue_id_size)) { + ++rss->refcnt; + return rss; + } + rss = rte_malloc(__func__, offsetof(struct mlx4_rss, queue_id) + + queue_id_size, 0); + if (!rss) + goto error; + *rss = (struct mlx4_rss){ + .priv = priv, + .refcnt = 1, + .usecnt = 0, + .qp = NULL, + .ind = NULL, + .fields = fields, + .queues = queues, + }; + memcpy(rss->key, key, MLX4_RSS_HASH_KEY_SIZE); + memcpy(rss->queue_id, queue_id, queue_id_size); + LIST_INSERT_HEAD(&priv->rss, rss, next); + return rss; +error: + rte_errno = ENOMEM; + return NULL; +} + +/** + * Release a RSS context instance. + * + * Used when destroying a flow rule targeting one or several Rx queues. + * + * This function decrements the reference count of the context and destroys + * it after reaching 0. The context must have no users at this point; all + * prior calls to mlx4_rss_attach() must have been followed by matching + * calls to mlx4_rss_detach(). + * + * @param rss + * RSS context to release. + */ +void +mlx4_rss_put(struct mlx4_rss *rss) +{ + assert(rss->refcnt); + if (--rss->refcnt) + return; + assert(!rss->usecnt); + assert(!rss->qp); + assert(!rss->ind); + LIST_REMOVE(rss, next); + rte_free(rss); +} + +/** + * Attach a user to a RSS context instance. + * + * Used when the RSS QP and indirection table objects must be instantiated, + * that is, when a flow rule must be enabled. + * + * This function increments the usage count of the context. + * + * @param rss + * RSS context to attach to. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rss_attach(struct mlx4_rss *rss) +{ + assert(rss->refcnt); + if (rss->usecnt++) { + assert(rss->qp); + assert(rss->ind); + return 0; + } + + struct ibv_wq *ind_tbl[rss->queues]; + struct mlx4_priv *priv = rss->priv; + struct rte_eth_dev *dev = ETH_DEV(priv); + const char *msg; + unsigned int i = 0; + int ret; + + if (!rte_is_power_of_2(RTE_DIM(ind_tbl))) { + ret = EINVAL; + msg = "number of RSS queues must be a power of two"; + goto error; + } + for (i = 0; i != RTE_DIM(ind_tbl); ++i) { + uint16_t id = rss->queue_id[i]; + struct rxq *rxq = NULL; + + if (id < dev->data->nb_rx_queues) + rxq = dev->data->rx_queues[id]; + if (!rxq) { + ret = EINVAL; + msg = "RSS target queue is not configured"; + goto error; + } + ret = mlx4_rxq_attach(rxq); + if (ret) { + ret = -ret; + msg = "unable to attach RSS target queue"; + goto error; + } + ind_tbl[i] = rxq->wq; + } + rss->ind = mlx4_glue->create_rwq_ind_table + (priv->ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = rte_log2_u32(RTE_DIM(ind_tbl)), + .ind_tbl = ind_tbl, + .comp_mask = 0, + }); + if (!rss->ind) { + ret = errno ? errno : EINVAL; + msg = "RSS indirection table creation failure"; + goto error; + } + rss->qp = mlx4_glue->create_qp_ex + (priv->ctx, + &(struct ibv_qp_init_attr_ex){ + .comp_mask = (IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_IND_TABLE), + .qp_type = IBV_QPT_RAW_PACKET, + .pd = priv->pd, + .rwq_ind_tbl = rss->ind, + .rx_hash_conf = { + .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE, + .rx_hash_key = rss->key, + .rx_hash_fields_mask = rss->fields, + }, + }); + if (!rss->qp) { + ret = errno ? errno : EINVAL; + msg = "RSS hash QP creation failure"; + goto error; + } + ret = mlx4_glue->modify_qp + (rss->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_INIT, + .port_num = priv->port, + }, + IBV_QP_STATE | IBV_QP_PORT); + if (ret) { + msg = "failed to switch RSS hash QP to INIT state"; + goto error; + } + ret = mlx4_glue->modify_qp + (rss->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTR, + }, + IBV_QP_STATE); + if (ret) { + msg = "failed to switch RSS hash QP to RTR state"; + goto error; + } + return 0; +error: + if (rss->qp) { + claim_zero(mlx4_glue->destroy_qp(rss->qp)); + rss->qp = NULL; + } + if (rss->ind) { + claim_zero(mlx4_glue->destroy_rwq_ind_table(rss->ind)); + rss->ind = NULL; + } + while (i--) + mlx4_rxq_detach(dev->data->rx_queues[rss->queue_id[i]]); + ERROR("mlx4: %s", msg); + --rss->usecnt; + rte_errno = ret; + return -ret; +} + +/** + * Detach a user from a RSS context instance. + * + * Used when disabling (not destroying) a flow rule. + * + * This function decrements the usage count of the context and destroys + * usage resources after reaching 0. + * + * @param rss + * RSS context to detach from. + */ +void +mlx4_rss_detach(struct mlx4_rss *rss) +{ + struct mlx4_priv *priv = rss->priv; + struct rte_eth_dev *dev = ETH_DEV(priv); + unsigned int i; + + assert(rss->refcnt); + assert(rss->qp); + assert(rss->ind); + if (--rss->usecnt) + return; + claim_zero(mlx4_glue->destroy_qp(rss->qp)); + rss->qp = NULL; + claim_zero(mlx4_glue->destroy_rwq_ind_table(rss->ind)); + rss->ind = NULL; + for (i = 0; i != rss->queues; ++i) + mlx4_rxq_detach(dev->data->rx_queues[rss->queue_id[i]]); +} + +/** + * Initialize common RSS context resources. + * + * Because ConnectX-3 hardware limitations require a fixed order in the + * indirection table, WQs must be allocated sequentially to be part of a + * common RSS context. + * + * Since a newly created WQ cannot be moved to a different context, this + * function allocates them all at once, one for each configured Rx queue, + * as well as all related resources (CQs and mbufs). + * + * This must therefore be done before creating any Rx flow rules relying on + * indirection tables. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rss_init(struct mlx4_priv *priv) +{ + struct rte_eth_dev *dev = ETH_DEV(priv); + uint8_t log2_range = rte_log2_u32(dev->data->nb_rx_queues); + uint32_t wq_num_prev = 0; + const char *msg; + unsigned int i; + int ret; + + if (priv->rss_init) + return 0; + if (ETH_DEV(priv)->data->nb_rx_queues > priv->hw_rss_max_qps) { + ERROR("RSS does not support more than %d queues", + priv->hw_rss_max_qps); + rte_errno = EINVAL; + return -rte_errno; + } + /* Prepare range for RSS contexts before creating the first WQ. */ + ret = mlx4_glue->dv_set_context_attr + (priv->ctx, + MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ, + &log2_range); + if (ret) { + ERROR("cannot set up range size for RSS context to %u" + " (for %u Rx queues), error: %s", + 1 << log2_range, dev->data->nb_rx_queues, strerror(ret)); + rte_errno = ret; + return -ret; + } + for (i = 0; i != ETH_DEV(priv)->data->nb_rx_queues; ++i) { + struct rxq *rxq = ETH_DEV(priv)->data->rx_queues[i]; + struct ibv_cq *cq; + struct ibv_wq *wq; + uint32_t wq_num; + + /* Attach the configured Rx queues. */ + if (rxq) { + assert(!rxq->usecnt); + ret = mlx4_rxq_attach(rxq); + if (!ret) { + wq_num = rxq->wq->wq_num; + goto wq_num_check; + } + ret = -ret; + msg = "unable to create Rx queue resources"; + goto error; + } + /* + * WQs are temporarily allocated for unconfigured Rx queues + * to maintain proper index alignment in indirection table + * by skipping unused WQ numbers. + * + * The reason this works at all even though these WQs are + * immediately destroyed is that WQNs are allocated + * sequentially and are guaranteed to never be reused in the + * same context by the underlying implementation. + */ + cq = mlx4_glue->create_cq(priv->ctx, 1, NULL, NULL, 0); + if (!cq) { + ret = ENOMEM; + msg = "placeholder CQ creation failure"; + goto error; + } + wq = mlx4_glue->create_wq + (priv->ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = 1, + .max_sge = 1, + .pd = priv->pd, + .cq = cq, + }); + if (wq) { + wq_num = wq->wq_num; + claim_zero(mlx4_glue->destroy_wq(wq)); + } else { + wq_num = 0; /* Shut up GCC 4.8 warnings. */ + } + claim_zero(mlx4_glue->destroy_cq(cq)); + if (!wq) { + ret = ENOMEM; + msg = "placeholder WQ creation failure"; + goto error; + } +wq_num_check: + /* + * While guaranteed by the implementation, make sure WQ + * numbers are really sequential (as the saying goes, + * trust, but verify). + */ + if (i && wq_num - wq_num_prev != 1) { + if (rxq) + mlx4_rxq_detach(rxq); + ret = ERANGE; + msg = "WQ numbers are not sequential"; + goto error; + } + wq_num_prev = wq_num; + } + priv->rss_init = 1; + return 0; +error: + ERROR("cannot initialize common RSS resources (queue %u): %s: %s", + i, msg, strerror(ret)); + while (i--) { + struct rxq *rxq = ETH_DEV(priv)->data->rx_queues[i]; + + if (rxq) + mlx4_rxq_detach(rxq); + } + rte_errno = ret; + return -ret; +} + +/** + * Release common RSS context resources. + * + * As the reverse of mlx4_rss_init(), this must be done after removing all + * flow rules relying on indirection tables. + * + * @param priv + * Pointer to private structure. + */ +void +mlx4_rss_deinit(struct mlx4_priv *priv) +{ + unsigned int i; + + if (!priv->rss_init) + return; + for (i = 0; i != ETH_DEV(priv)->data->nb_rx_queues; ++i) { + struct rxq *rxq = ETH_DEV(priv)->data->rx_queues[i]; + + if (rxq) { + assert(rxq->usecnt == 1); + mlx4_rxq_detach(rxq); + } + } + priv->rss_init = 0; +} + +/** + * Attach a user to a Rx queue. + * + * Used when the resources of an Rx queue must be instantiated for it to + * become in a usable state. + * + * This function increments the usage count of the Rx queue. + * + * @param rxq + * Pointer to Rx queue structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rxq_attach(struct rxq *rxq) +{ + if (rxq->usecnt++) { + assert(rxq->cq); + assert(rxq->wq); + assert(rxq->wqes); + assert(rxq->rq_db); + return 0; + } + + struct mlx4_priv *priv = rxq->priv; + struct rte_eth_dev *dev = ETH_DEV(priv); + const uint32_t elts_n = 1 << rxq->elts_n; + const uint32_t sges_n = 1 << rxq->sges_n; + struct rte_mbuf *(*elts)[elts_n] = rxq->elts; + struct mlx4dv_obj mlxdv; + struct mlx4dv_rwq dv_rwq; + struct mlx4dv_cq dv_cq = { .comp_mask = MLX4DV_CQ_MASK_UAR, }; + const char *msg; + struct ibv_cq *cq = NULL; + struct ibv_wq *wq = NULL; + uint32_t create_flags = 0; + uint32_t comp_mask = 0; + volatile struct mlx4_wqe_data_seg (*wqes)[]; + unsigned int i; + int ret; + + assert(rte_is_power_of_2(elts_n)); + priv->verbs_alloc_ctx.type = MLX4_VERBS_ALLOC_TYPE_RX_QUEUE; + priv->verbs_alloc_ctx.obj = rxq; + cq = mlx4_glue->create_cq(priv->ctx, elts_n / sges_n, NULL, + rxq->channel, 0); + if (!cq) { + ret = ENOMEM; + msg = "CQ creation failure"; + goto error; + } + /* By default, FCS (CRC) is stripped by hardware. */ + if (rxq->crc_present) { + create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; + comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + } + wq = mlx4_glue->create_wq + (priv->ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = elts_n / sges_n, + .max_sge = sges_n, + .pd = priv->pd, + .cq = cq, + .comp_mask = comp_mask, + .create_flags = create_flags, + }); + if (!wq) { + ret = errno ? errno : EINVAL; + msg = "WQ creation failure"; + goto error; + } + ret = mlx4_glue->modify_wq + (wq, + &(struct ibv_wq_attr){ + .attr_mask = IBV_WQ_ATTR_STATE, + .wq_state = IBV_WQS_RDY, + }); + if (ret) { + msg = "WQ state change to IBV_WQS_RDY failed"; + goto error; + } + /* Retrieve device queue information. */ + mlxdv.cq.in = cq; + mlxdv.cq.out = &dv_cq; + mlxdv.rwq.in = wq; + mlxdv.rwq.out = &dv_rwq; + ret = mlx4_glue->dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ); + if (ret) { + msg = "failed to obtain device information from WQ/CQ objects"; + goto error; + } + /* Pre-register Rx mempool. */ + DEBUG("port %u Rx queue %u registering mp %s having %u chunks", + ETH_DEV(priv)->data->port_id, rxq->stats.idx, + rxq->mp->name, rxq->mp->nb_mem_chunks); + mlx4_mr_update_mp(dev, &rxq->mr_ctrl, rxq->mp); + wqes = (volatile struct mlx4_wqe_data_seg (*)[]) + ((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset); + for (i = 0; i != RTE_DIM(*elts); ++i) { + volatile struct mlx4_wqe_data_seg *scat = &(*wqes)[i]; + struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp); + + if (buf == NULL) { + while (i--) { + rte_pktmbuf_free_seg((*elts)[i]); + (*elts)[i] = NULL; + } + ret = ENOMEM; + msg = "cannot allocate mbuf"; + goto error; + } + /* Headroom is reserved by rte_pktmbuf_alloc(). */ + assert(buf->data_off == RTE_PKTMBUF_HEADROOM); + /* Buffer is supposed to be empty. */ + assert(rte_pktmbuf_data_len(buf) == 0); + assert(rte_pktmbuf_pkt_len(buf) == 0); + /* Only the first segment keeps headroom. */ + if (i % sges_n) + buf->data_off = 0; + buf->port = rxq->port_id; + buf->data_len = rte_pktmbuf_tailroom(buf); + buf->pkt_len = rte_pktmbuf_tailroom(buf); + buf->nb_segs = 1; + *scat = (struct mlx4_wqe_data_seg){ + .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, + uintptr_t)), + .byte_count = rte_cpu_to_be_32(buf->data_len), + .lkey = mlx4_rx_mb2mr(rxq, buf), + }; + (*elts)[i] = buf; + } + DEBUG("%p: allocated and configured %u segments (max %u packets)", + (void *)rxq, elts_n, elts_n / sges_n); + rxq->cq = cq; + rxq->wq = wq; + rxq->wqes = wqes; + rxq->rq_db = dv_rwq.rdb; + rxq->mcq.buf = dv_cq.buf.buf; + rxq->mcq.cqe_cnt = dv_cq.cqe_cnt; + rxq->mcq.set_ci_db = dv_cq.set_ci_db; + rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0; + rxq->mcq.arm_db = dv_cq.arm_db; + rxq->mcq.arm_sn = dv_cq.arm_sn; + rxq->mcq.cqn = dv_cq.cqn; + rxq->mcq.cq_uar = dv_cq.cq_uar; + rxq->mcq.cq_db_reg = (uint8_t *)dv_cq.cq_uar + MLX4_CQ_DOORBELL; + /* Update doorbell counter. */ + rxq->rq_ci = elts_n / sges_n; + rte_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + priv->verbs_alloc_ctx.type = MLX4_VERBS_ALLOC_TYPE_NONE; + return 0; +error: + if (wq) + claim_zero(mlx4_glue->destroy_wq(wq)); + if (cq) + claim_zero(mlx4_glue->destroy_cq(cq)); + --rxq->usecnt; + rte_errno = ret; + ERROR("error while attaching Rx queue %p: %s: %s", + (void *)rxq, msg, strerror(ret)); + priv->verbs_alloc_ctx.type = MLX4_VERBS_ALLOC_TYPE_NONE; + return -ret; +} + +/** + * Detach a user from a Rx queue. + * + * This function decrements the usage count of the Rx queue and destroys + * usage resources after reaching 0. + * + * @param rxq + * Pointer to Rx queue structure. + */ +void +mlx4_rxq_detach(struct rxq *rxq) +{ + unsigned int i; + struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts; + + if (--rxq->usecnt) + return; + rxq->rq_ci = 0; + memset(&rxq->mcq, 0, sizeof(rxq->mcq)); + rxq->rq_db = NULL; + rxq->wqes = NULL; + claim_zero(mlx4_glue->destroy_wq(rxq->wq)); + rxq->wq = NULL; + claim_zero(mlx4_glue->destroy_cq(rxq->cq)); + rxq->cq = NULL; + DEBUG("%p: freeing Rx queue elements", (void *)rxq); + for (i = 0; (i != RTE_DIM(*elts)); ++i) { + if (!(*elts)[i]) + continue; + rte_pktmbuf_free_seg((*elts)[i]); + (*elts)[i] = NULL; + } +} + +/** + * Returns the per-queue supported offloads. + * + * @param priv + * Pointer to private structure. + * + * @return + * Supported Tx offloads. + */ +uint64_t +mlx4_get_rx_queue_offloads(struct mlx4_priv *priv) +{ + uint64_t offloads = DEV_RX_OFFLOAD_SCATTER | + DEV_RX_OFFLOAD_KEEP_CRC | + DEV_RX_OFFLOAD_JUMBO_FRAME; + + if (priv->hw_csum) + offloads |= DEV_RX_OFFLOAD_CHECKSUM; + return offloads; +} + +/** + * Returns the per-port supported offloads. + * + * @param priv + * Pointer to private structure. + * + * @return + * Supported Rx offloads. + */ +uint64_t +mlx4_get_rx_port_offloads(struct mlx4_priv *priv) +{ + uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER; + + (void)priv; + return offloads; +} + +/** + * DPDK callback to configure a Rx queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Rx queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * @param mp + * Memory pool for buffer allocations. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) +{ + struct mlx4_priv *priv = dev->data->dev_private; + uint32_t mb_len = rte_pktmbuf_data_room_size(mp); + struct rte_mbuf *(*elts)[rte_align32pow2(desc)]; + struct rxq *rxq; + struct mlx4_malloc_vec vec[] = { + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*rxq), + .addr = (void **)&rxq, + }, + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*elts), + .addr = (void **)&elts, + }, + }; + int ret; + uint32_t crc_present; + uint64_t offloads; + + offloads = conf->offloads | dev->data->dev_conf.rxmode.offloads; + + DEBUG("%p: configuring queue %u for %u descriptors", + (void *)dev, idx, desc); + + if (idx >= dev->data->nb_rx_queues) { + rte_errno = EOVERFLOW; + ERROR("%p: queue index out of range (%u >= %u)", + (void *)dev, idx, dev->data->nb_rx_queues); + return -rte_errno; + } + rxq = dev->data->rx_queues[idx]; + if (rxq) { + rte_errno = EEXIST; + ERROR("%p: Rx queue %u already configured, release it first", + (void *)dev, idx); + return -rte_errno; + } + if (!desc) { + rte_errno = EINVAL; + ERROR("%p: invalid number of Rx descriptors", (void *)dev); + return -rte_errno; + } + if (desc != RTE_DIM(*elts)) { + desc = RTE_DIM(*elts); + WARN("%p: increased number of descriptors in Rx queue %u" + " to the next power of two (%u)", + (void *)dev, idx, desc); + } + /* By default, FCS (CRC) is stripped by hardware. */ + crc_present = 0; + if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) { + if (priv->hw_fcs_strip) { + crc_present = 1; + } else { + WARN("%p: CRC stripping has been disabled but will still" + " be performed by hardware, make sure MLNX_OFED and" + " firmware are up to date", + (void *)dev); + } + } + DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from" + " incoming frames to hide it", + (void *)dev, + crc_present ? "disabled" : "enabled", + crc_present << 2); + /* Allocate and initialize Rx queue. */ + mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket); + if (!rxq) { + ERROR("%p: unable to allocate queue index %u", + (void *)dev, idx); + return -rte_errno; + } + *rxq = (struct rxq){ + .priv = priv, + .mp = mp, + .port_id = dev->data->port_id, + .sges_n = 0, + .elts_n = rte_log2_u32(desc), + .elts = elts, + /* Toggle Rx checksum offload if hardware supports it. */ + .csum = priv->hw_csum && + (offloads & DEV_RX_OFFLOAD_CHECKSUM), + .csum_l2tun = priv->hw_csum_l2tun && + (offloads & DEV_RX_OFFLOAD_CHECKSUM), + .crc_present = crc_present, + .l2tun_offload = priv->hw_csum_l2tun, + .stats = { + .idx = idx, + }, + .socket = socket, + }; + /* Enable scattered packets support for this queue if necessary. */ + assert(mb_len >= RTE_PKTMBUF_HEADROOM); + if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= + (mb_len - RTE_PKTMBUF_HEADROOM)) { + ; + } else if (offloads & DEV_RX_OFFLOAD_SCATTER) { + uint32_t size = + RTE_PKTMBUF_HEADROOM + + dev->data->dev_conf.rxmode.max_rx_pkt_len; + uint32_t sges_n; + + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len)); + rxq->sges_n = sges_n; + /* Make sure sges_n did not overflow. */ + size = mb_len * (1 << rxq->sges_n); + size -= RTE_PKTMBUF_HEADROOM; + if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) { + rte_errno = EOVERFLOW; + ERROR("%p: too many SGEs (%u) needed to handle" + " requested maximum packet size %u", + (void *)dev, + 1 << sges_n, + dev->data->dev_conf.rxmode.max_rx_pkt_len); + goto error; + } + } else { + WARN("%p: the requested maximum Rx packet size (%u) is" + " larger than a single mbuf (%u) and scattered" + " mode has not been requested", + (void *)dev, + dev->data->dev_conf.rxmode.max_rx_pkt_len, + mb_len - RTE_PKTMBUF_HEADROOM); + } + DEBUG("%p: maximum number of segments per packet: %u", + (void *)dev, 1 << rxq->sges_n); + if (desc % (1 << rxq->sges_n)) { + rte_errno = EINVAL; + ERROR("%p: number of Rx queue descriptors (%u) is not a" + " multiple of maximum segments per packet (%u)", + (void *)dev, + desc, + 1 << rxq->sges_n); + goto error; + } + if (mlx4_mr_btree_init(&rxq->mr_ctrl.cache_bh, + MLX4_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + if (dev->data->dev_conf.intr_conf.rxq) { + rxq->channel = mlx4_glue->create_comp_channel(priv->ctx); + if (rxq->channel == NULL) { + rte_errno = ENOMEM; + ERROR("%p: Rx interrupt completion channel creation" + " failure: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + if (mlx4_fd_set_non_blocking(rxq->channel->fd) < 0) { + ERROR("%p: unable to make Rx interrupt completion" + " channel non-blocking: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + } + DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq); + dev->data->rx_queues[idx] = rxq; + return 0; +error: + dev->data->rx_queues[idx] = NULL; + ret = rte_errno; + mlx4_rx_queue_release(rxq); + rte_errno = ret; + assert(rte_errno > 0); + return -rte_errno; +} + +/** + * DPDK callback to release a Rx queue. + * + * @param dpdk_rxq + * Generic Rx queue pointer. + */ +void +mlx4_rx_queue_release(void *dpdk_rxq) +{ + struct rxq *rxq = (struct rxq *)dpdk_rxq; + struct mlx4_priv *priv; + unsigned int i; + + if (rxq == NULL) + return; + priv = rxq->priv; + for (i = 0; i != ETH_DEV(priv)->data->nb_rx_queues; ++i) + if (ETH_DEV(priv)->data->rx_queues[i] == rxq) { + DEBUG("%p: removing Rx queue %p from list", + (void *)ETH_DEV(priv), (void *)rxq); + ETH_DEV(priv)->data->rx_queues[i] = NULL; + break; + } + assert(!rxq->cq); + assert(!rxq->wq); + assert(!rxq->wqes); + assert(!rxq->rq_db); + if (rxq->channel) + claim_zero(mlx4_glue->destroy_comp_channel(rxq->channel)); + mlx4_mr_btree_free(&rxq->mr_ctrl.cache_bh); + rte_free(rxq); +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxtx.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxtx.c new file mode 100644 index 000000000..391271a61 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxtx.c @@ -0,0 +1,1396 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Data plane functions for mlx4 driver. + */ + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_branch_prediction.h> +#include <rte_common.h> +#include <rte_io.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include "mlx4.h" +#include "mlx4_prm.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Pointer-value pair structure used in tx_post_send for saving the first + * DWORD (32 byte) of a TXBB. + */ +struct pv { + union { + volatile struct mlx4_wqe_data_seg *dseg; + volatile uint32_t *dst; + }; + uint32_t val; +}; + +/** A helper structure for TSO packet handling. */ +struct tso_info { + /** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */ + struct pv *pv; + /** Current entry in the pv array. */ + int pv_counter; + /** Total size of the WQE including padding. */ + uint32_t wqe_size; + /** Size of TSO header to prepend to each packet to send. */ + uint16_t tso_header_size; + /** Total size of the TSO segment in the WQE. */ + uint16_t wqe_tso_seg_size; + /** Raw WQE size in units of 16 Bytes and without padding. */ + uint8_t fence_size; +}; + +/** A table to translate Rx completion flags to packet type. */ +uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = { + /* + * The index to the array should have: + * bit[7] - MLX4_CQE_L2_TUNNEL + * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4 + * bit[5] - MLX4_CQE_STATUS_UDP + * bit[4] - MLX4_CQE_STATUS_TCP + * bit[3] - MLX4_CQE_STATUS_IPV4OPT + * bit[2] - MLX4_CQE_STATUS_IPV6 + * bit[1] - MLX4_CQE_STATUS_IPF + * bit[0] - MLX4_CQE_STATUS_IPV4 + * giving a total of up to 256 entries. + */ + /* L2 */ + [0x00] = RTE_PTYPE_L2_ETHER, + /* L3 */ + [0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG, + [0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG, + [0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x08] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_NONFRAG, + [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_NONFRAG, + [0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_FRAG, + [0x0b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_FRAG, + /* TCP */ + [0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP, + [0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP, + [0x16] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_TCP, + [0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_TCP, + /* UDP */ + [0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP, + [0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP, + [0x26] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_UDP, + [0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_UDP, + /* Tunneled - L3 IPV6 */ + [0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + [0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, + [0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, + [0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, + [0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, + [0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, + [0x8b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, + /* Tunneled - L3 IPV6, TCP */ + [0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0x96] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP, + [0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP, + /* Tunneled - L3 IPV6, UDP */ + [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xa6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_UDP, + [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_UDP, + /* Tunneled - L3 IPV4 */ + [0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + [0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, + [0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, + [0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, + [0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, + [0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, + [0xcb] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, + /* Tunneled - L3 IPV4, TCP */ + [0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0xd6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_TCP, + [0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_TCP, + /* Tunneled - L3 IPV4, UDP */ + [0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xe6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_UDP, + [0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_UDP, +}; + +/** + * Stamp TXBB burst so it won't be reused by the HW. + * + * Routine is used when freeing WQE used by the chip or when failing + * building an WQ entry has failed leaving partial information on the queue. + * + * @param sq + * Pointer to the SQ structure. + * @param start + * Pointer to the first TXBB to stamp. + * @param end + * Pointer to the followed end TXBB to stamp. + * + * @return + * Stamping burst size in byte units. + */ +static uint32_t +mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start, + volatile uint32_t *end) +{ + uint32_t stamp = sq->stamp; + int32_t size = (intptr_t)end - (intptr_t)start; + + assert(start != end); + /* Hold SQ ring wrap around. */ + if (size < 0) { + size = (int32_t)sq->size + size; + do { + *start = stamp; + start += MLX4_SQ_STAMP_DWORDS; + } while (start != (volatile uint32_t *)sq->eob); + start = (volatile uint32_t *)sq->buf; + /* Flip invalid stamping ownership. */ + stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT); + sq->stamp = stamp; + if (start == end) + return size; + } + do { + *start = stamp; + start += MLX4_SQ_STAMP_DWORDS; + } while (start != end); + return (uint32_t)size; +} + +/** + * Manage Tx completions. + * + * When sending a burst, mlx4_tx_burst() posts several WRs. + * To improve performance, a completion event is only required once every + * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information + * for other WRs, but this information would not be used anyway. + * + * @param txq + * Pointer to Tx queue structure. + * @param elts_m + * Tx elements number mask. + * @param sq + * Pointer to the SQ structure. + */ +static void +mlx4_txq_complete(struct txq *txq, const unsigned int elts_m, + struct mlx4_sq *sq) +{ + unsigned int elts_tail = txq->elts_tail; + struct mlx4_cq *cq = &txq->mcq; + volatile struct mlx4_cqe *cqe; + uint32_t completed; + uint32_t cons_index = cq->cons_index; + volatile uint32_t *first_txbb; + + /* + * Traverse over all CQ entries reported and handle each WQ entry + * reported by them. + */ + do { + cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index); + if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cons_index & cq->cqe_cnt))) + break; +#ifndef NDEBUG + /* + * Make sure we read the CQE after we read the ownership bit. + */ + rte_io_rmb(); + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + volatile struct mlx4_err_cqe *cqe_err = + (volatile struct mlx4_err_cqe *)cqe; + ERROR("%p CQE error - vendor syndrome: 0x%x" + " syndrome: 0x%x\n", + (void *)txq, cqe_err->vendor_err, + cqe_err->syndrome); + break; + } +#endif /* NDEBUG */ + cons_index++; + } while (1); + completed = (cons_index - cq->cons_index) * txq->elts_comp_cd_init; + if (unlikely(!completed)) + return; + /* First stamping address is the end of the last one. */ + first_txbb = (&(*txq->elts)[elts_tail & elts_m])->eocb; + elts_tail += completed; + /* The new tail element holds the end address. */ + sq->remain_size += mlx4_txq_stamp_freed_wqe(sq, first_txbb, + (&(*txq->elts)[elts_tail & elts_m])->eocb); + /* Update CQ consumer index. */ + cq->cons_index = cons_index; + *cq->set_ci_db = rte_cpu_to_be_32(cons_index & MLX4_CQ_DB_CI_MASK); + txq->elts_tail = elts_tail; +} + +/** + * Write Tx data segment to the SQ. + * + * @param dseg + * Pointer to data segment in SQ. + * @param lkey + * Memory region lkey. + * @param addr + * Data address. + * @param byte_count + * Big endian bytes count of the data to send. + */ +static inline void +mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg, + uint32_t lkey, uintptr_t addr, rte_be32_t byte_count) +{ + dseg->addr = rte_cpu_to_be_64(addr); + dseg->lkey = lkey; +#if RTE_CACHE_LINE_SIZE < 64 + /* + * Need a barrier here before writing the byte_count + * fields to make sure that all the data is visible + * before the byte_count field is set. + * Otherwise, if the segment begins a new cacheline, + * the HCA prefetcher could grab the 64-byte chunk and + * get a valid (!= 0xffffffff) byte count but stale + * data, and end up sending the wrong data. + */ + rte_io_wmb(); +#endif /* RTE_CACHE_LINE_SIZE */ + dseg->byte_count = byte_count; +} + +/** + * Obtain and calculate TSO information needed for assembling a TSO WQE. + * + * @param buf + * Pointer to the first packet mbuf. + * @param txq + * Pointer to Tx queue structure. + * @param tinfo + * Pointer to a structure to fill the info with. + * + * @return + * 0 on success, negative value upon error. + */ +static inline int +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf, + struct txq *txq, + struct tso_info *tinfo) +{ + struct mlx4_sq *sq = &txq->msq; + const uint8_t tunneled = txq->priv->hw_csum_l2tun && + (buf->ol_flags & PKT_TX_TUNNEL_MASK); + + tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len; + if (tunneled) + tinfo->tso_header_size += + buf->outer_l2_len + buf->outer_l3_len; + if (unlikely(buf->tso_segsz == 0 || + tinfo->tso_header_size == 0 || + tinfo->tso_header_size > MLX4_MAX_TSO_HEADER || + tinfo->tso_header_size > buf->data_len)) + return -EINVAL; + /* + * Calculate the WQE TSO segment size + * Note: + * 1. An LSO segment must be padded such that the subsequent data + * segment is 16-byte aligned. + * 2. The start address of the TSO segment is always 16 Bytes aligned. + */ + tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) + + tinfo->tso_header_size, + sizeof(struct mlx4_wqe_data_seg)); + tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) + + tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) + + buf->nb_segs; + tinfo->wqe_size = + RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT), + MLX4_TXBB_SIZE); + /* Validate WQE size and WQE space in the send queue. */ + if (sq->remain_size < tinfo->wqe_size || + tinfo->wqe_size > MLX4_MAX_WQE_SIZE) + return -ENOMEM; + /* Init pv. */ + tinfo->pv = (struct pv *)txq->bounce_buf; + tinfo->pv_counter = 0; + return 0; +} + +/** + * Fill the TSO WQE data segments with info on buffers to transmit . + * + * @param buf + * Pointer to the first packet mbuf. + * @param txq + * Pointer to Tx queue structure. + * @param tinfo + * Pointer to TSO info to use. + * @param dseg + * Pointer to the first data segment in the TSO WQE. + * @param ctrl + * Pointer to the control segment in the TSO WQE. + * + * @return + * 0 on success, negative value upon error. + */ +static inline volatile struct mlx4_wqe_ctrl_seg * +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf, + struct txq *txq, + struct tso_info *tinfo, + volatile struct mlx4_wqe_data_seg *dseg, + volatile struct mlx4_wqe_ctrl_seg *ctrl) +{ + uint32_t lkey; + int nb_segs = buf->nb_segs; + int nb_segs_txbb; + struct mlx4_sq *sq = &txq->msq; + struct rte_mbuf *sbuf = buf; + struct pv *pv = tinfo->pv; + int *pv_counter = &tinfo->pv_counter; + volatile struct mlx4_wqe_ctrl_seg *ctrl_next = + (volatile struct mlx4_wqe_ctrl_seg *) + ((volatile uint8_t *)ctrl + tinfo->wqe_size); + uint16_t data_len = sbuf->data_len - tinfo->tso_header_size; + uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t, + tinfo->tso_header_size); + + do { + /* how many dseg entries do we have in the current TXBB ? */ + nb_segs_txbb = (MLX4_TXBB_SIZE - + ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >> + MLX4_SEG_SHIFT; + switch (nb_segs_txbb) { +#ifndef NDEBUG + default: + /* Should never happen. */ + rte_panic("%p: Invalid number of SGEs(%d) for a TXBB", + (void *)txq, nb_segs_txbb); + /* rte_panic never returns. */ + break; +#endif /* NDEBUG */ + case 4: + /* Memory region key for this memory pool. */ + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + dseg->addr = rte_cpu_to_be_64(data_addr); + dseg->lkey = lkey; + /* + * This data segment starts at the beginning of a new + * TXBB, so we need to postpone its byte_count writing + * for later. + */ + pv[*pv_counter].dseg = dseg; + /* + * Zero length segment is treated as inline segment + * with zero data. + */ + pv[(*pv_counter)++].val = + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + case 3: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + mlx4_fill_tx_data_seg(dseg, lkey, data_addr, + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000)); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + case 2: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + mlx4_fill_tx_data_seg(dseg, lkey, data_addr, + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000)); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + case 1: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + mlx4_fill_tx_data_seg(dseg, lkey, data_addr, + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000)); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + } + /* Wrap dseg if it points at the end of the queue. */ + if ((volatile uint8_t *)dseg >= sq->eob) + dseg = (volatile struct mlx4_wqe_data_seg *) + ((volatile uint8_t *)dseg - sq->size); + } while (true); +err: + return NULL; +} + +/** + * Fill the packet's l2, l3 and l4 headers to the WQE. + * + * This will be used as the header for each TSO segment that is transmitted. + * + * @param buf + * Pointer to the first packet mbuf. + * @param txq + * Pointer to Tx queue structure. + * @param tinfo + * Pointer to TSO info to use. + * @param ctrl + * Pointer to the control segment in the TSO WQE. + * + * @return + * 0 on success, negative value upon error. + */ +static inline volatile struct mlx4_wqe_data_seg * +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf, + struct txq *txq, + struct tso_info *tinfo, + volatile struct mlx4_wqe_ctrl_seg *ctrl) +{ + volatile struct mlx4_wqe_lso_seg *tseg = + (volatile struct mlx4_wqe_lso_seg *)(ctrl + 1); + struct mlx4_sq *sq = &txq->msq; + struct pv *pv = tinfo->pv; + int *pv_counter = &tinfo->pv_counter; + int remain_size = tinfo->tso_header_size; + char *from = rte_pktmbuf_mtod(buf, char *); + uint16_t txbb_avail_space; + /* Union to overcome volatile constraints when copying TSO header. */ + union { + volatile uint8_t *vto; + uint8_t *to; + } thdr = { .vto = (volatile uint8_t *)tseg->header, }; + + /* + * TSO data always starts at offset 20 from the beginning of the TXBB + * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned + * we can write the first 44 TSO header bytes without worry for TxQ + * wrapping or overwriting the first TXBB 32bit word. + */ + txbb_avail_space = MLX4_TXBB_SIZE - + (sizeof(struct mlx4_wqe_ctrl_seg) + + sizeof(struct mlx4_wqe_lso_seg)); + while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) { + /* Copy to end of txbb. */ + rte_memcpy(thdr.to, from, txbb_avail_space); + from += txbb_avail_space; + thdr.to += txbb_avail_space; + /* New TXBB, Check for TxQ wrap. */ + if (thdr.to >= sq->eob) + thdr.vto = sq->buf; + /* New TXBB, stash the first 32bits for later use. */ + pv[*pv_counter].dst = (volatile uint32_t *)thdr.to; + pv[(*pv_counter)++].val = *(uint32_t *)from, + from += sizeof(uint32_t); + thdr.to += sizeof(uint32_t); + remain_size -= txbb_avail_space + sizeof(uint32_t); + /* Avail space in new TXBB is TXBB size - 4 */ + txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t); + } + if (remain_size > txbb_avail_space) { + rte_memcpy(thdr.to, from, txbb_avail_space); + from += txbb_avail_space; + thdr.to += txbb_avail_space; + remain_size -= txbb_avail_space; + /* New TXBB, Check for TxQ wrap. */ + if (thdr.to >= sq->eob) + thdr.vto = sq->buf; + pv[*pv_counter].dst = (volatile uint32_t *)thdr.to; + rte_memcpy(&pv[*pv_counter].val, from, remain_size); + (*pv_counter)++; + } else if (remain_size) { + rte_memcpy(thdr.to, from, remain_size); + } + tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) | + tinfo->tso_header_size); + /* Calculate data segment location */ + return (volatile struct mlx4_wqe_data_seg *) + ((uintptr_t)tseg + tinfo->wqe_tso_seg_size); +} + +/** + * Write data segments and header for TSO uni/multi segment packet. + * + * @param buf + * Pointer to the first packet mbuf. + * @param txq + * Pointer to Tx queue structure. + * @param ctrl + * Pointer to the WQE control segment. + * + * @return + * Pointer to the next WQE control segment on success, NULL otherwise. + */ +static volatile struct mlx4_wqe_ctrl_seg * +mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq, + volatile struct mlx4_wqe_ctrl_seg *ctrl) +{ + volatile struct mlx4_wqe_data_seg *dseg; + volatile struct mlx4_wqe_ctrl_seg *ctrl_next; + struct mlx4_sq *sq = &txq->msq; + struct tso_info tinfo; + struct pv *pv; + int pv_counter; + int ret; + + ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo); + if (unlikely(ret)) + goto error; + dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl); + if (unlikely(dseg == NULL)) + goto error; + if ((uintptr_t)dseg >= (uintptr_t)sq->eob) + dseg = (volatile struct mlx4_wqe_data_seg *) + ((uintptr_t)dseg - sq->size); + ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl); + if (unlikely(ctrl_next == NULL)) + goto error; + /* Write the first DWORD of each TXBB save earlier. */ + if (likely(tinfo.pv_counter)) { + pv = tinfo.pv; + pv_counter = tinfo.pv_counter; + /* Need a barrier here before writing the first TXBB word. */ + rte_io_wmb(); + do { + --pv_counter; + *pv[pv_counter].dst = pv[pv_counter].val; + } while (pv_counter > 0); + } + ctrl->fence_size = tinfo.fence_size; + sq->remain_size -= tinfo.wqe_size; + return ctrl_next; +error: + txq->stats.odropped++; + return NULL; +} + +/** + * Write data segments of multi-segment packet. + * + * @param buf + * Pointer to the first packet mbuf. + * @param txq + * Pointer to Tx queue structure. + * @param ctrl + * Pointer to the WQE control segment. + * + * @return + * Pointer to the next WQE control segment on success, NULL otherwise. + */ +static volatile struct mlx4_wqe_ctrl_seg * +mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq, + volatile struct mlx4_wqe_ctrl_seg *ctrl) +{ + struct pv *pv = (struct pv *)txq->bounce_buf; + struct mlx4_sq *sq = &txq->msq; + struct rte_mbuf *sbuf = buf; + uint32_t lkey; + int pv_counter = 0; + int nb_segs = buf->nb_segs; + uint32_t wqe_size; + volatile struct mlx4_wqe_data_seg *dseg = + (volatile struct mlx4_wqe_data_seg *)(ctrl + 1); + + ctrl->fence_size = 1 + nb_segs; + wqe_size = RTE_ALIGN((uint32_t)(ctrl->fence_size << MLX4_SEG_SHIFT), + MLX4_TXBB_SIZE); + /* Validate WQE size and WQE space in the send queue. */ + if (sq->remain_size < wqe_size || + wqe_size > MLX4_MAX_WQE_SIZE) + return NULL; + /* + * Fill the data segments with buffer information. + * First WQE TXBB head segment is always control segment, + * so jump to tail TXBB data segments code for the first + * WQE data segments filling. + */ + goto txbb_tail_segs; +txbb_head_seg: + /* Memory region key (big endian) for this memory pool. */ + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) { + DEBUG("%p: unable to get MP <-> MR association", + (void *)txq); + return NULL; + } + /* Handle WQE wraparound. */ + if (dseg >= + (volatile struct mlx4_wqe_data_seg *)sq->eob) + dseg = (volatile struct mlx4_wqe_data_seg *) + sq->buf; + dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t)); + dseg->lkey = lkey; + /* + * This data segment starts at the beginning of a new + * TXBB, so we need to postpone its byte_count writing + * for later. + */ + pv[pv_counter].dseg = dseg; + /* + * Zero length segment is treated as inline segment + * with zero data. + */ + pv[pv_counter++].val = rte_cpu_to_be_32(sbuf->data_len ? + sbuf->data_len : 0x80000000); + sbuf = sbuf->next; + dseg++; + nb_segs--; +txbb_tail_segs: + /* Jump to default if there are more than two segments remaining. */ + switch (nb_segs) { + default: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) { + DEBUG("%p: unable to get MP <-> MR association", + (void *)txq); + return NULL; + } + mlx4_fill_tx_data_seg(dseg, lkey, + rte_pktmbuf_mtod(sbuf, uintptr_t), + rte_cpu_to_be_32(sbuf->data_len ? + sbuf->data_len : + 0x80000000)); + sbuf = sbuf->next; + dseg++; + nb_segs--; + /* fallthrough */ + case 2: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) { + DEBUG("%p: unable to get MP <-> MR association", + (void *)txq); + return NULL; + } + mlx4_fill_tx_data_seg(dseg, lkey, + rte_pktmbuf_mtod(sbuf, uintptr_t), + rte_cpu_to_be_32(sbuf->data_len ? + sbuf->data_len : + 0x80000000)); + sbuf = sbuf->next; + dseg++; + nb_segs--; + /* fallthrough */ + case 1: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) { + DEBUG("%p: unable to get MP <-> MR association", + (void *)txq); + return NULL; + } + mlx4_fill_tx_data_seg(dseg, lkey, + rte_pktmbuf_mtod(sbuf, uintptr_t), + rte_cpu_to_be_32(sbuf->data_len ? + sbuf->data_len : + 0x80000000)); + nb_segs--; + if (nb_segs) { + sbuf = sbuf->next; + dseg++; + goto txbb_head_seg; + } + /* fallthrough */ + case 0: + break; + } + /* Write the first DWORD of each TXBB save earlier. */ + if (pv_counter) { + /* Need a barrier here before writing the byte_count. */ + rte_io_wmb(); + for (--pv_counter; pv_counter >= 0; pv_counter--) + pv[pv_counter].dseg->byte_count = pv[pv_counter].val; + } + sq->remain_size -= wqe_size; + /* Align next WQE address to the next TXBB. */ + return (volatile struct mlx4_wqe_ctrl_seg *) + ((volatile uint8_t *)ctrl + wqe_size); +} + +/** + * DPDK callback for Tx. + * + * @param dpdk_txq + * Generic pointer to Tx queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct txq *txq = (struct txq *)dpdk_txq; + unsigned int elts_head = txq->elts_head; + const unsigned int elts_n = txq->elts_n; + const unsigned int elts_m = elts_n - 1; + unsigned int bytes_sent = 0; + unsigned int i; + unsigned int max = elts_head - txq->elts_tail; + struct mlx4_sq *sq = &txq->msq; + volatile struct mlx4_wqe_ctrl_seg *ctrl; + struct txq_elt *elt; + + assert(txq->elts_comp_cd != 0); + if (likely(max >= txq->elts_comp_cd_init)) + mlx4_txq_complete(txq, elts_m, sq); + max = elts_n - max; + assert(max >= 1); + assert(max <= elts_n); + /* Always leave one free entry in the ring. */ + --max; + if (max > pkts_n) + max = pkts_n; + elt = &(*txq->elts)[elts_head & elts_m]; + /* First Tx burst element saves the next WQE control segment. */ + ctrl = elt->wqe; + for (i = 0; (i != max); ++i) { + struct rte_mbuf *buf = pkts[i]; + struct txq_elt *elt_next = &(*txq->elts)[++elts_head & elts_m]; + uint32_t owner_opcode = sq->owner_opcode; + volatile struct mlx4_wqe_data_seg *dseg = + (volatile struct mlx4_wqe_data_seg *)(ctrl + 1); + volatile struct mlx4_wqe_ctrl_seg *ctrl_next; + union { + uint32_t flags; + uint16_t flags16[2]; + } srcrb; + uint32_t lkey; + bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG); + + /* Clean up old buffer. */ + if (likely(elt->buf != NULL)) { + struct rte_mbuf *tmp = elt->buf; + +#ifndef NDEBUG + /* Poisoning. */ + memset(&elt->buf, 0x66, sizeof(struct rte_mbuf *)); +#endif + /* Faster than rte_pktmbuf_free(). */ + do { + struct rte_mbuf *next = tmp->next; + + rte_pktmbuf_free_seg(tmp); + tmp = next; + } while (tmp != NULL); + } + RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); + if (tso) { + /* Change opcode to TSO */ + owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD; + owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR; + ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl); + if (!ctrl_next) { + elt->buf = NULL; + break; + } + } else if (buf->nb_segs == 1) { + /* Validate WQE space in the send queue. */ + if (sq->remain_size < MLX4_TXBB_SIZE) { + elt->buf = NULL; + break; + } + lkey = mlx4_tx_mb2mr(txq, buf); + if (unlikely(lkey == (uint32_t)-1)) { + /* MR does not exist. */ + DEBUG("%p: unable to get MP <-> MR association", + (void *)txq); + elt->buf = NULL; + break; + } + mlx4_fill_tx_data_seg(dseg++, lkey, + rte_pktmbuf_mtod(buf, uintptr_t), + rte_cpu_to_be_32(buf->data_len)); + /* Set WQE size in 16-byte units. */ + ctrl->fence_size = 0x2; + sq->remain_size -= MLX4_TXBB_SIZE; + /* Align next WQE address to the next TXBB. */ + ctrl_next = ctrl + 0x4; + } else { + ctrl_next = mlx4_tx_burst_segs(buf, txq, ctrl); + if (!ctrl_next) { + elt->buf = NULL; + break; + } + } + /* Hold SQ ring wrap around. */ + if ((volatile uint8_t *)ctrl_next >= sq->eob) { + ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *) + ((volatile uint8_t *)ctrl_next - sq->size); + /* Flip HW valid ownership. */ + sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT; + } + /* + * For raw Ethernet, the SOLICIT flag is used to indicate + * that no ICRC should be calculated. + */ + if (--txq->elts_comp_cd == 0) { + /* Save the completion burst end address. */ + elt_next->eocb = (volatile uint32_t *)ctrl_next; + txq->elts_comp_cd = txq->elts_comp_cd_init; + srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT | + MLX4_WQE_CTRL_CQ_UPDATE); + } else { + srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT); + } + /* Enable HW checksum offload if requested */ + if (txq->csum && + (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) { + const uint64_t is_tunneled = (buf->ol_flags & + (PKT_TX_TUNNEL_GRE | + PKT_TX_TUNNEL_VXLAN)); + + if (is_tunneled && txq->csum_l2tun) { + owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM | + MLX4_WQE_CTRL_IL4_HDR_CSUM; + if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) + srcrb.flags |= + RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM); + } else { + srcrb.flags |= + RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM); + } + } + if (txq->lb) { + /* + * Copy destination MAC address to the WQE, this allows + * loopback in eSwitch, so that VFs and PF can + * communicate with each other. + */ + srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *)); + ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *, + sizeof(uint16_t))); + } else { + ctrl->imm = 0; + } + ctrl->srcrb_flags = srcrb.flags; + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + rte_io_wmb(); + ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode); + elt->buf = buf; + bytes_sent += buf->pkt_len; + ctrl = ctrl_next; + elt = elt_next; + } + /* Take a shortcut if nothing must be sent. */ + if (unlikely(i == 0)) + return 0; + /* Save WQE address of the next Tx burst element. */ + elt->wqe = ctrl; + /* Increment send statistics counters. */ + txq->stats.opackets += i; + txq->stats.obytes += bytes_sent; + /* Make sure that descriptors are written before doorbell record. */ + rte_wmb(); + /* Ring QP doorbell. */ + rte_write32(txq->msq.doorbell_qpn, MLX4_TX_BFREG(txq)); + txq->elts_head += i; + return i; +} + +/** + * Translate Rx completion flags to packet type. + * + * @param[in] cqe + * Pointer to CQE. + * + * @return + * Packet type for struct rte_mbuf. + */ +static inline uint32_t +rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe, + uint32_t l2tun_offload) +{ + uint8_t idx = 0; + uint32_t pinfo = rte_be_to_cpu_32(cqe->vlan_my_qpn); + uint32_t status = rte_be_to_cpu_32(cqe->status); + + /* + * The index to the array should have: + * bit[7] - MLX4_CQE_L2_TUNNEL + * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4 + */ + if (l2tun_offload && (pinfo & MLX4_CQE_L2_TUNNEL)) + idx |= ((pinfo & MLX4_CQE_L2_TUNNEL) >> 20) | + ((pinfo & MLX4_CQE_L2_TUNNEL_IPV4) >> 19); + /* + * The index to the array should have: + * bit[5] - MLX4_CQE_STATUS_UDP + * bit[4] - MLX4_CQE_STATUS_TCP + * bit[3] - MLX4_CQE_STATUS_IPV4OPT + * bit[2] - MLX4_CQE_STATUS_IPV6 + * bit[1] - MLX4_CQE_STATUS_IPF + * bit[0] - MLX4_CQE_STATUS_IPV4 + * giving a total of up to 256 entries. + */ + idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22); + if (status & MLX4_CQE_STATUS_IPV6) + idx |= ((status & MLX4_CQE_STATUS_IPV6F) >> 11); + return mlx4_ptype_table[idx]; +} + +/** + * Translate Rx completion flags to offload flags. + * + * @param flags + * Rx completion flags returned by mlx4_cqe_flags(). + * @param csum + * Whether Rx checksums are enabled. + * @param csum_l2tun + * Whether Rx L2 tunnel checksums are enabled. + * + * @return + * Offload flags (ol_flags) in mbuf format. + */ +static inline uint32_t +rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun) +{ + uint32_t ol_flags = 0; + + if (csum) + ol_flags |= + mlx4_transpose(flags, + MLX4_CQE_STATUS_IP_HDR_CSUM_OK, + PKT_RX_IP_CKSUM_GOOD) | + mlx4_transpose(flags, + MLX4_CQE_STATUS_TCP_UDP_CSUM_OK, + PKT_RX_L4_CKSUM_GOOD); + if ((flags & MLX4_CQE_L2_TUNNEL) && csum_l2tun) + ol_flags |= + mlx4_transpose(flags, + MLX4_CQE_L2_TUNNEL_IPOK, + PKT_RX_IP_CKSUM_GOOD) | + mlx4_transpose(flags, + MLX4_CQE_L2_TUNNEL_L4_CSUM, + PKT_RX_L4_CKSUM_GOOD); + return ol_flags; +} + +/** + * Extract checksum information from CQE flags. + * + * @param cqe + * Pointer to CQE structure. + * @param csum + * Whether Rx checksums are enabled. + * @param csum_l2tun + * Whether Rx L2 tunnel checksums are enabled. + * + * @return + * CQE checksum information. + */ +static inline uint32_t +mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun) +{ + uint32_t flags = 0; + + /* + * The relevant bits are in different locations on their + * CQE fields therefore we can join them in one 32bit + * variable. + */ + if (csum) + flags = (rte_be_to_cpu_32(cqe->status) & + MLX4_CQE_STATUS_IPV4_CSUM_OK); + if (csum_l2tun) + flags |= (rte_be_to_cpu_32(cqe->vlan_my_qpn) & + (MLX4_CQE_L2_TUNNEL | + MLX4_CQE_L2_TUNNEL_IPOK | + MLX4_CQE_L2_TUNNEL_L4_CSUM | + MLX4_CQE_L2_TUNNEL_IPV4)); + return flags; +} + +/** + * Poll one CQE from CQ. + * + * @param rxq + * Pointer to the receive queue structure. + * @param[out] out + * Just polled CQE. + * + * @return + * Number of bytes of the CQE, 0 in case there is no completion. + */ +static unsigned int +mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out) +{ + int ret = 0; + volatile struct mlx4_cqe *cqe = NULL; + struct mlx4_cq *cq = &rxq->mcq; + + cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index); + if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cq->cons_index & cq->cqe_cnt)) + goto out; + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rte_rmb(); + assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)); + assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != + MLX4_CQE_OPCODE_ERROR); + ret = rte_be_to_cpu_32(cqe->byte_cnt); + ++cq->cons_index; +out: + *out = cqe; + return ret; +} + +/** + * DPDK callback for Rx with scattered packets support. + * + * @param dpdk_rxq + * Generic pointer to Rx queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct rxq *rxq = dpdk_rxq; + const uint32_t wr_cnt = (1 << rxq->elts_n) - 1; + const uint16_t sges_n = rxq->sges_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + unsigned int i = 0; + uint32_t rq_ci = rxq->rq_ci << sges_n; + int len = 0; + + while (pkts_n) { + volatile struct mlx4_cqe *cqe; + uint32_t idx = rq_ci & wr_cnt; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx]; + + /* Update the 'next' pointer of the previous segment. */ + if (pkt) + seg->next = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(scat); + rep = rte_mbuf_raw_alloc(rxq->mp); + if (unlikely(rep == NULL)) { + ++rxq->stats.rx_nombuf; + if (!pkt) { + /* + * No buffers before we even started, + * bail out silently. + */ + break; + } + while (pkt != seg) { + assert(pkt != (*rxq->elts)[idx]); + rep = pkt->next; + pkt->next = NULL; + pkt->nb_segs = 1; + rte_mbuf_raw_free(pkt); + pkt = rep; + } + break; + } + if (!pkt) { + /* Looking for the new packet. */ + len = mlx4_cq_poll_one(rxq, &cqe); + if (!len) { + rte_mbuf_raw_free(rep); + break; + } + if (unlikely(len < 0)) { + /* Rx error, packet is likely too large. */ + rte_mbuf_raw_free(rep); + ++rxq->stats.idropped; + goto skip; + } + pkt = seg; + assert(len >= (rxq->crc_present << 2)); + /* Update packet information. */ + pkt->packet_type = + rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload); + pkt->ol_flags = PKT_RX_RSS_HASH; + pkt->hash.rss = cqe->immed_rss_invalid; + if (rxq->crc_present) + len -= ETHER_CRC_LEN; + pkt->pkt_len = len; + if (rxq->csum | rxq->csum_l2tun) { + uint32_t flags = + mlx4_cqe_flags(cqe, + rxq->csum, + rxq->csum_l2tun); + + pkt->ol_flags = + rxq_cq_to_ol_flags(flags, + rxq->csum, + rxq->csum_l2tun); + } + } + rep->nb_segs = 1; + rep->port = rxq->port_id; + rep->data_len = seg->data_len; + rep->data_off = seg->data_off; + (*rxq->elts)[idx] = rep; + /* + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. + */ + scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + scat->lkey = mlx4_rx_mb2mr(rxq, rep); + if (len > seg->data_len) { + len -= seg->data_len; + ++pkt->nb_segs; + ++rq_ci; + continue; + } + /* The last segment. */ + seg->data_len = len; + /* Increment bytes counter. */ + rxq->stats.ibytes += pkt->pkt_len; + /* Return packet. */ + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; +skip: + /* Align consumer index to the next stride. */ + rq_ci >>= sges_n; + ++rq_ci; + rq_ci <<= sges_n; + } + if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci)) + return 0; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci >> sges_n; + rte_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + *rxq->mcq.set_ci_db = + rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK); + /* Increment packets counter. */ + rxq->stats.ipackets += i; + return i; +} + +/** + * Dummy DPDK callback for Tx. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_txq + * Generic pointer to Tx queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_txq; + (void)pkts; + (void)pkts_n; + rte_mb(); + return 0; +} + +/** + * Dummy DPDK callback for Rx. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_rxq + * Generic pointer to Rx queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_rxq; + (void)pkts; + (void)pkts_n; + rte_mb(); + return 0; +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxtx.h b/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxtx.h new file mode 100644 index 000000000..8baf33fa9 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_rxtx.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef MLX4_RXTX_H_ +#define MLX4_RXTX_H_ + +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev_driver.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> + +#include "mlx4.h" +#include "mlx4_prm.h" +#include "mlx4_mr.h" + +/** Rx queue counters. */ +struct mlx4_rxq_stats { + unsigned int idx; /**< Mapping index. */ + uint64_t ipackets; /**< Total of successfully received packets. */ + uint64_t ibytes; /**< Total of successfully received bytes. */ + uint64_t idropped; /**< Total of packets dropped when Rx ring full. */ + uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */ +}; + +/** Rx queue descriptor. */ +struct rxq { + struct mlx4_priv *priv; /**< Back pointer to private data. */ + struct rte_mempool *mp; /**< Memory pool for allocations. */ + struct ibv_cq *cq; /**< Completion queue. */ + struct ibv_wq *wq; /**< Work queue. */ + struct ibv_comp_channel *channel; /**< Rx completion channel. */ + uint16_t rq_ci; /**< Saved RQ consumer index. */ + uint16_t port_id; /**< Port ID for incoming packets. */ + uint16_t sges_n; /**< Number of segments per packet (log2 value). */ + uint16_t elts_n; /**< Mbuf queue size (log2 value). */ + struct mlx4_mr_ctrl mr_ctrl; /* MR control descriptor. */ + struct rte_mbuf *(*elts)[]; /**< Rx elements. */ + volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */ + volatile uint32_t *rq_db; /**< RQ doorbell record. */ + uint32_t csum:1; /**< Enable checksum offloading. */ + uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */ + uint32_t crc_present:1; /**< CRC must be subtracted. */ + uint32_t l2tun_offload:1; /**< L2 tunnel offload is enabled. */ + struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */ + struct mlx4_rxq_stats stats; /**< Rx queue counters. */ + unsigned int socket; /**< CPU socket ID for allocations. */ + uint32_t usecnt; /**< Number of users relying on queue resources. */ + uint8_t data[]; /**< Remaining queue resources. */ +}; + +/** Shared flow target for Rx queues. */ +struct mlx4_rss { + LIST_ENTRY(mlx4_rss) next; /**< Next entry in list. */ + struct mlx4_priv *priv; /**< Back pointer to private data. */ + uint32_t refcnt; /**< Reference count for this object. */ + uint32_t usecnt; /**< Number of users relying on @p qp and @p ind. */ + struct ibv_qp *qp; /**< Queue pair. */ + struct ibv_rwq_ind_table *ind; /**< Indirection table. */ + uint64_t fields; /**< Fields for RSS processing (Verbs format). */ + uint8_t key[MLX4_RSS_HASH_KEY_SIZE]; /**< Hash key to use. */ + uint16_t queues; /**< Number of target queues. */ + uint16_t queue_id[]; /**< Target queues. */ +}; + +/** Tx element. */ +struct txq_elt { + struct rte_mbuf *buf; /**< Buffer. */ + union { + volatile struct mlx4_wqe_ctrl_seg *wqe; /**< SQ WQE. */ + volatile uint32_t *eocb; /**< End of completion burst. */ + }; +}; + +/** Tx queue counters. */ +struct mlx4_txq_stats { + unsigned int idx; /**< Mapping index. */ + uint64_t opackets; /**< Total of successfully sent packets. */ + uint64_t obytes; /**< Total of successfully sent bytes. */ + uint64_t odropped; /**< Total number of packets failed to transmit. */ +}; + +/** Tx queue descriptor. */ +struct txq { + struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */ + struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */ + uint16_t port_id; /**< Port ID of device. */ + unsigned int elts_head; /**< Current index in (*elts)[]. */ + unsigned int elts_tail; /**< First element awaiting completion. */ + int elts_comp_cd; /**< Countdown for next completion. */ + unsigned int elts_comp_cd_init; /**< Initial value for countdown. */ + unsigned int elts_n; /**< (*elts)[] length. */ + struct mlx4_mr_ctrl mr_ctrl; /* MR control descriptor. */ + struct txq_elt (*elts)[]; /**< Tx elements. */ + struct mlx4_txq_stats stats; /**< Tx queue counters. */ + uint32_t max_inline; /**< Max inline send size. */ + uint32_t csum:1; /**< Enable checksum offloading. */ + uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */ + uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */ + uint8_t *bounce_buf; + /**< Memory used for storing the first DWORD of data TXBBs. */ + struct mlx4_priv *priv; /**< Back pointer to private data. */ + unsigned int socket; /**< CPU socket ID for allocations. */ + struct ibv_cq *cq; /**< Completion queue. */ + struct ibv_qp *qp; /**< Queue pair. */ + uint8_t data[]; /**< Remaining queue resources. */ +}; + +#define MLX4_TX_BFREG(txq) \ + (MLX4_PROC_PRIV((txq)->port_id)->uar_table[(txq)->stats.idx]) + +/* mlx4_rxq.c */ + +uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE]; +int mlx4_rss_init(struct mlx4_priv *priv); +void mlx4_rss_deinit(struct mlx4_priv *priv); +struct mlx4_rss *mlx4_rss_get(struct mlx4_priv *priv, uint64_t fields, + const uint8_t key[MLX4_RSS_HASH_KEY_SIZE], + uint16_t queues, const uint16_t queue_id[]); +void mlx4_rss_put(struct mlx4_rss *rss); +int mlx4_rss_attach(struct mlx4_rss *rss); +void mlx4_rss_detach(struct mlx4_rss *rss); +int mlx4_rxq_attach(struct rxq *rxq); +void mlx4_rxq_detach(struct rxq *rxq); +uint64_t mlx4_get_rx_port_offloads(struct mlx4_priv *priv); +uint64_t mlx4_get_rx_queue_offloads(struct mlx4_priv *priv); +int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +void mlx4_rx_queue_release(void *dpdk_rxq); + +/* mlx4_rxtx.c */ + +uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); + +/* mlx4_txq.c */ + +int mlx4_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd); +uint64_t mlx4_get_tx_port_offloads(struct mlx4_priv *priv); +int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf); +void mlx4_tx_queue_release(void *dpdk_txq); + +/* mlx4_mr.c */ + +void mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl); +uint32_t mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr); +uint32_t mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb); +uint32_t mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, + struct rte_mempool *mp); + +/** + * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the + * cloned mbuf is allocated is returned instead. + * + * @param buf + * Pointer to mbuf. + * + * @return + * Memory pool where data is located for given mbuf. + */ +static inline struct rte_mempool * +mlx4_mb2mp(struct rte_mbuf *buf) +{ + if (unlikely(RTE_MBUF_CLONED(buf))) + return rte_mbuf_from_indirect(buf)->pool; + return buf->pool; +} + +/** + * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx + * as mempool is pre-configured and static. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Address to search. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx4_rx_addr2mr(struct rxq *rxq, uintptr_t addr) +{ + struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + uint32_t lkey; + + /* Linear search on MR cache array. */ + lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru, + MLX4_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half (Binary Search) on miss. */ + return mlx4_rx_addr2mr_bh(rxq, addr); +} + +#define mlx4_rx_mb2mr(rxq, mb) mlx4_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) + +/** + * Query LKey from a packet buffer for Tx. If not found, add the mempool. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Address to search. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx4_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb) +{ + struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + uintptr_t addr = (uintptr_t)mb->buf_addr; + uint32_t lkey; + + /* Check generation bit to see if there's any change on existing MRs. */ + if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen)) + mlx4_mr_flush_local_cache(mr_ctrl); + /* Linear search on MR cache array. */ + lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru, + MLX4_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half on miss. */ + return mlx4_tx_mb2mr_bh(txq, mb); +} + +#endif /* MLX4_RXTX_H_ */ diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_txq.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_txq.c new file mode 100644 index 000000000..01a5efd80 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_txq.c @@ -0,0 +1,527 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Tx queues configuration for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <sys/mman.h> +#include <inttypes.h> +#include <unistd.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_ethdev_driver.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> + +#include "mlx4.h" +#include "mlx4_glue.h" +#include "mlx4_prm.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Initialize Tx UAR registers for primary process. + * + * @param txq + * Pointer to Tx queue structure. + */ +static void +txq_uar_init(struct txq *txq) +{ + struct mlx4_priv *priv = txq->priv; + struct mlx4_proc_priv *ppriv = MLX4_PROC_PRIV(PORT_ID(priv)); + + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + assert(ppriv); + ppriv->uar_table[txq->stats.idx] = txq->msq.db; +} + +#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET +/** + * Remap UAR register of a Tx queue for secondary process. + * + * Remapped address is stored at the table in the process private structure of + * the device, indexed by queue index. + * + * @param txq + * Pointer to Tx queue structure. + * @param fd + * Verbs file descriptor to map UAR pages. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +txq_uar_init_secondary(struct txq *txq, int fd) +{ + struct mlx4_priv *priv = txq->priv; + struct mlx4_proc_priv *ppriv = MLX4_PROC_PRIV(PORT_ID(priv)); + void *addr; + uintptr_t uar_va; + uintptr_t offset; + const size_t page_size = sysconf(_SC_PAGESIZE); + + assert(ppriv); + /* + * As rdma-core, UARs are mapped in size of OS page + * size. Ref to libmlx4 function: mlx4_init_context() + */ + uar_va = (uintptr_t)txq->msq.db; + offset = uar_va & (page_size - 1); /* Offset in page. */ + addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd, + txq->msq.uar_mmap_offset); + if (addr == MAP_FAILED) { + ERROR("port %u mmap failed for BF reg of txq %u", + txq->port_id, txq->stats.idx); + rte_errno = ENXIO; + return -rte_errno; + } + addr = RTE_PTR_ADD(addr, offset); + ppriv->uar_table[txq->stats.idx] = addr; + return 0; +} + +/** + * Unmap UAR register of a Tx queue for secondary process. + * + * @param txq + * Pointer to Tx queue structure. + */ +static void +txq_uar_uninit_secondary(struct txq *txq) +{ + struct mlx4_proc_priv *ppriv = MLX4_PROC_PRIV(PORT_ID(txq->priv)); + const size_t page_size = sysconf(_SC_PAGESIZE); + void *addr; + + addr = ppriv->uar_table[txq->stats.idx]; + munmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size); +} + +/** + * Initialize Tx UAR registers for secondary process. + * + * @param dev + * Pointer to Ethernet device. + * @param fd + * Verbs file descriptor to map UAR pages. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd) +{ + const unsigned int txqs_n = dev->data->nb_tx_queues; + struct txq *txq; + unsigned int i; + int ret; + + assert(rte_eal_process_type() == RTE_PROC_SECONDARY); + for (i = 0; i != txqs_n; ++i) { + txq = dev->data->tx_queues[i]; + if (!txq) + continue; + assert(txq->stats.idx == (uint16_t)i); + ret = txq_uar_init_secondary(txq, fd); + if (ret) + goto error; + } + return 0; +error: + /* Rollback. */ + do { + txq = dev->data->tx_queues[i]; + if (!txq) + continue; + txq_uar_uninit_secondary(txq); + } while (i--); + return -rte_errno; +} +#else +int +mlx4_tx_uar_init_secondary(struct rte_eth_dev *dev __rte_unused, + int fd __rte_unused) +{ + assert(rte_eal_process_type() == RTE_PROC_SECONDARY); + ERROR("UAR remap is not supported"); + rte_errno = ENOTSUP; + return -rte_errno; +} +#endif + +/** + * Free Tx queue elements. + * + * @param txq + * Pointer to Tx queue structure. + */ +static void +mlx4_txq_free_elts(struct txq *txq) +{ + unsigned int elts_head = txq->elts_head; + unsigned int elts_tail = txq->elts_tail; + struct txq_elt (*elts)[txq->elts_n] = txq->elts; + unsigned int elts_m = txq->elts_n - 1; + + DEBUG("%p: freeing WRs", (void *)txq); + while (elts_tail != elts_head) { + struct txq_elt *elt = &(*elts)[elts_tail++ & elts_m]; + + assert(elt->buf != NULL); + rte_pktmbuf_free(elt->buf); + elt->buf = NULL; + elt->wqe = NULL; + } + txq->elts_tail = txq->elts_head; +} + +/** + * Retrieves information needed in order to directly access the Tx queue. + * + * @param txq + * Pointer to Tx queue structure. + * @param mlxdv + * Pointer to device information for this Tx queue. + */ +static void +mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv) +{ + struct mlx4_sq *sq = &txq->msq; + struct mlx4_cq *cq = &txq->mcq; + struct mlx4dv_qp *dqp = mlxdv->qp.out; + struct mlx4dv_cq *dcq = mlxdv->cq.out; + + /* Total length, including headroom and spare WQEs. */ + sq->size = (uint32_t)dqp->rq.offset - (uint32_t)dqp->sq.offset; + sq->buf = (uint8_t *)dqp->buf.buf + dqp->sq.offset; + sq->eob = sq->buf + sq->size; + uint32_t headroom_size = 2048 + (1 << dqp->sq.wqe_shift); + /* Continuous headroom size bytes must always stay freed. */ + sq->remain_size = sq->size - headroom_size; + sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT); + sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL | + (0u << MLX4_SQ_OWNER_BIT)); +#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET + sq->uar_mmap_offset = dqp->uar_mmap_offset; +#else + sq->uar_mmap_offset = -1; /* Make mmap() fail. */ +#endif + sq->db = dqp->sdb; + sq->doorbell_qpn = dqp->doorbell_qpn; + cq->buf = dcq->buf.buf; + cq->cqe_cnt = dcq->cqe_cnt; + cq->set_ci_db = dcq->set_ci_db; + cq->cqe_64 = (dcq->cqe_size & 64) ? 1 : 0; +} + +/** + * Returns the per-port supported offloads. + * + * @param priv + * Pointer to private structure. + * + * @return + * Supported Tx offloads. + */ +uint64_t +mlx4_get_tx_port_offloads(struct mlx4_priv *priv) +{ + uint64_t offloads = DEV_TX_OFFLOAD_MULTI_SEGS; + + if (priv->hw_csum) { + offloads |= (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM); + } + if (priv->tso) + offloads |= DEV_TX_OFFLOAD_TCP_TSO; + if (priv->hw_csum_l2tun) { + offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (priv->tso) + offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO); + } + return offloads; +} + +/** + * DPDK callback to configure a Tx queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Tx queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) +{ + struct mlx4_priv *priv = dev->data->dev_private; + struct mlx4dv_obj mlxdv; + struct mlx4dv_qp dv_qp; + struct mlx4dv_cq dv_cq; + struct txq_elt (*elts)[rte_align32pow2(desc)]; + struct ibv_qp_init_attr qp_init_attr; + struct txq *txq; + uint8_t *bounce_buf; + struct mlx4_malloc_vec vec[] = { + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*txq), + .addr = (void **)&txq, + }, + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*elts), + .addr = (void **)&elts, + }, + { + .align = RTE_CACHE_LINE_SIZE, + .size = MLX4_MAX_WQE_SIZE, + .addr = (void **)&bounce_buf, + }, + }; + int ret; + uint64_t offloads; + + offloads = conf->offloads | dev->data->dev_conf.txmode.offloads; + DEBUG("%p: configuring queue %u for %u descriptors", + (void *)dev, idx, desc); + if (idx >= dev->data->nb_tx_queues) { + rte_errno = EOVERFLOW; + ERROR("%p: queue index out of range (%u >= %u)", + (void *)dev, idx, dev->data->nb_tx_queues); + return -rte_errno; + } + txq = dev->data->tx_queues[idx]; + if (txq) { + rte_errno = EEXIST; + DEBUG("%p: Tx queue %u already configured, release it first", + (void *)dev, idx); + return -rte_errno; + } + if (!desc) { + rte_errno = EINVAL; + ERROR("%p: invalid number of Tx descriptors", (void *)dev); + return -rte_errno; + } + if (desc != RTE_DIM(*elts)) { + desc = RTE_DIM(*elts); + WARN("%p: increased number of descriptors in Tx queue %u" + " to the next power of two (%u)", + (void *)dev, idx, desc); + } + /* Allocate and initialize Tx queue. */ + mlx4_zmallocv_socket("TXQ", vec, RTE_DIM(vec), socket); + if (!txq) { + ERROR("%p: unable to allocate queue index %u", + (void *)dev, idx); + return -rte_errno; + } + *txq = (struct txq){ + .priv = priv, + .port_id = dev->data->port_id, + .stats = { + .idx = idx, + }, + .socket = socket, + .elts_n = desc, + .elts = elts, + .elts_head = 0, + .elts_tail = 0, + /* + * Request send completion every MLX4_PMD_TX_PER_COMP_REQ + * packets or at least 4 times per ring. + */ + .elts_comp_cd = + RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4), + .elts_comp_cd_init = + RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4), + .csum = priv->hw_csum && + (offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM)), + .csum_l2tun = priv->hw_csum_l2tun && + (offloads & + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM), + /* Enable Tx loopback for VF devices. */ + .lb = !!priv->vf, + .bounce_buf = bounce_buf, + }; + priv->verbs_alloc_ctx.type = MLX4_VERBS_ALLOC_TYPE_TX_QUEUE; + priv->verbs_alloc_ctx.obj = txq; + txq->cq = mlx4_glue->create_cq(priv->ctx, desc, NULL, NULL, 0); + if (!txq->cq) { + rte_errno = ENOMEM; + ERROR("%p: CQ creation failure: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + qp_init_attr = (struct ibv_qp_init_attr){ + .send_cq = txq->cq, + .recv_cq = txq->cq, + .cap = { + .max_send_wr = + RTE_MIN(priv->device_attr.max_qp_wr, desc), + .max_send_sge = 1, + .max_inline_data = MLX4_PMD_MAX_INLINE, + }, + .qp_type = IBV_QPT_RAW_PACKET, + /* No completion events must occur by default. */ + .sq_sig_all = 0, + }; + txq->qp = mlx4_glue->create_qp(priv->pd, &qp_init_attr); + if (!txq->qp) { + rte_errno = errno ? errno : EINVAL; + ERROR("%p: QP creation failure: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + txq->max_inline = qp_init_attr.cap.max_inline_data; + ret = mlx4_glue->modify_qp + (txq->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_INIT, + .port_num = priv->port, + }, + IBV_QP_STATE | IBV_QP_PORT); + if (ret) { + rte_errno = ret; + ERROR("%p: QP state to IBV_QPS_INIT failed: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + ret = mlx4_glue->modify_qp + (txq->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTR, + }, + IBV_QP_STATE); + if (ret) { + rte_errno = ret; + ERROR("%p: QP state to IBV_QPS_RTR failed: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + ret = mlx4_glue->modify_qp + (txq->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTS, + }, + IBV_QP_STATE); + if (ret) { + rte_errno = ret; + ERROR("%p: QP state to IBV_QPS_RTS failed: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + /* Retrieve device queue information. */ +#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET + dv_qp = (struct mlx4dv_qp){ + .comp_mask = MLX4DV_QP_MASK_UAR_MMAP_OFFSET, + }; +#endif + mlxdv.cq.in = txq->cq; + mlxdv.cq.out = &dv_cq; + mlxdv.qp.in = txq->qp; + mlxdv.qp.out = &dv_qp; + ret = mlx4_glue->dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ); + if (ret) { + rte_errno = EINVAL; + ERROR("%p: failed to obtain information needed for" + " accessing the device queues", (void *)dev); + goto error; + } +#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET + if (!(dv_qp.comp_mask & MLX4DV_QP_MASK_UAR_MMAP_OFFSET)) { + WARN("%p: failed to obtain UAR mmap offset", (void *)dev); + dv_qp.uar_mmap_offset = -1; /* Make mmap() fail. */ + } +#endif + mlx4_txq_fill_dv_obj_info(txq, &mlxdv); + txq_uar_init(txq); + /* Save first wqe pointer in the first element. */ + (&(*txq->elts)[0])->wqe = + (volatile struct mlx4_wqe_ctrl_seg *)txq->msq.buf; + if (mlx4_mr_btree_init(&txq->mr_ctrl.cache_bh, + MLX4_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + /* Save pointer of global generation number to check memory event. */ + txq->mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen; + DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq); + dev->data->tx_queues[idx] = txq; + priv->verbs_alloc_ctx.type = MLX4_VERBS_ALLOC_TYPE_NONE; + return 0; +error: + dev->data->tx_queues[idx] = NULL; + ret = rte_errno; + mlx4_tx_queue_release(txq); + rte_errno = ret; + assert(rte_errno > 0); + priv->verbs_alloc_ctx.type = MLX4_VERBS_ALLOC_TYPE_NONE; + return -rte_errno; +} + +/** + * DPDK callback to release a Tx queue. + * + * @param dpdk_txq + * Generic Tx queue pointer. + */ +void +mlx4_tx_queue_release(void *dpdk_txq) +{ + struct txq *txq = (struct txq *)dpdk_txq; + struct mlx4_priv *priv; + unsigned int i; + + if (txq == NULL) + return; + priv = txq->priv; + for (i = 0; i != ETH_DEV(priv)->data->nb_tx_queues; ++i) + if (ETH_DEV(priv)->data->tx_queues[i] == txq) { + DEBUG("%p: removing Tx queue %p from list", + (void *)ETH_DEV(priv), (void *)txq); + ETH_DEV(priv)->data->tx_queues[i] = NULL; + break; + } + mlx4_txq_free_elts(txq); + if (txq->qp) + claim_zero(mlx4_glue->destroy_qp(txq->qp)); + if (txq->cq) + claim_zero(mlx4_glue->destroy_cq(txq->cq)); + mlx4_mr_btree_free(&txq->mr_ctrl.cache_bh); + rte_free(txq); +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_utils.c b/src/seastar/dpdk/drivers/net/mlx4/mlx4_utils.c new file mode 100644 index 000000000..a727d7036 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_utils.c @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +/** + * @file + * Utility functions used by the mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdint.h> + +#include <rte_errno.h> +#include <rte_malloc.h> +#include <rte_memory.h> + +#include "mlx4_utils.h" + +/** + * Make a file descriptor non-blocking. + * + * @param fd + * File descriptor to alter. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_fd_set_non_blocking(int fd) +{ + int ret = fcntl(fd, F_GETFL); + + if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK)) + return 0; + assert(errno); + rte_errno = errno; + return -rte_errno; +} + +/** + * Internal helper to allocate memory once for several disparate objects. + * + * The most restrictive alignment constraint for standard objects is assumed + * to be sizeof(double) and is used as a default value. + * + * C11 code would include stdalign.h and use alignof(max_align_t) however + * we'll stick with C99 for the time being. + */ +static inline size_t +mlx4_mallocv_inline(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int zero, int socket) +{ + unsigned int i; + size_t size; + size_t least; + uint8_t *data = NULL; + int fill = !vec[0].addr; + +fill: + size = 0; + least = 0; + for (i = 0; i < cnt; ++i) { + size_t align = (uintptr_t)vec[i].align; + + if (!align) { + align = sizeof(double); + } else if (!rte_is_power_of_2(align)) { + rte_errno = EINVAL; + goto error; + } + if (least < align) + least = align; + align = RTE_ALIGN_CEIL(size, align); + size = align + vec[i].size; + if (fill && vec[i].addr) + *vec[i].addr = data + align; + } + if (fill) + return size; + if (!zero) + data = rte_malloc_socket(type, size, least, socket); + else + data = rte_zmalloc_socket(type, size, least, socket); + if (data) { + fill = 1; + goto fill; + } + rte_errno = ENOMEM; +error: + for (i = 0; i != cnt; ++i) + if (vec[i].addr) + *vec[i].addr = NULL; + return 0; +} + +/** + * Allocate memory once for several disparate objects. + * + * This function adds iovec-like semantics (e.g. readv()) to rte_malloc(). + * Memory is allocated once for several contiguous objects of nonuniform + * sizes and alignment constraints. + * + * Each entry of @p vec describes the size, alignment constraint and + * provides a buffer address where the resulting object pointer must be + * stored. + * + * The buffer of the first entry is guaranteed to point to the beginning of + * the allocated region and is safe to use with rte_free(). + * + * NULL buffers are silently ignored. + * + * Providing a NULL buffer in the first entry prevents this function from + * allocating any memory but has otherwise no effect on its behavior. In + * this case, the contents of remaining non-NULL buffers are updated with + * addresses relative to zero (i.e. offsets that would have been used during + * the allocation). + * + * @param[in] type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param[in, out] vec + * Description of objects to allocate memory for. + * @param cnt + * Number of entries in @p vec. + * + * @return + * Size in bytes of the allocated region including any padding. In case of + * error, rte_errno is set, 0 is returned and NULL is stored in the + * non-NULL buffers pointed by @p vec. + * + * @see struct mlx4_malloc_vec + * @see rte_malloc() + */ +size_t +mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt) +{ + return mlx4_mallocv_inline(type, vec, cnt, 0, SOCKET_ID_ANY); +} + +/** + * Combines the semantics of mlx4_mallocv() with those of rte_zmalloc(). + * + * @see mlx4_mallocv() + * @see rte_zmalloc() + */ +size_t +mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt) +{ + return mlx4_mallocv_inline(type, vec, cnt, 1, SOCKET_ID_ANY); +} + +/** + * Socket-aware version of mlx4_mallocv(). + * + * This function takes one additional parameter. + * + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this + * function will behave the same as mlx4_mallocv(). + * + * @see mlx4_mallocv() + * @see rte_malloc_socket() + */ +size_t +mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket) +{ + return mlx4_mallocv_inline(type, vec, cnt, 0, socket); +} + +/** + * Combines the semantics of mlx4_mallocv_socket() with those of + * mlx4_zmalloc_socket(). + * + * @see mlx4_mallocv_socket() + * @see rte_zmalloc_socket() + */ +size_t +mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket) +{ + return mlx4_mallocv_inline(type, vec, cnt, 1, socket); +} diff --git a/src/seastar/dpdk/drivers/net/mlx4/mlx4_utils.h b/src/seastar/dpdk/drivers/net/mlx4/mlx4_utils.h new file mode 100644 index 000000000..86abb3b7e --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/mlx4_utils.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef MLX4_UTILS_H_ +#define MLX4_UTILS_H_ + +#include <assert.h> +#include <stddef.h> +#include <stdio.h> + +#include <rte_common.h> +#include <rte_log.h> + +#include "mlx4.h" + +#ifndef NDEBUG + +/* + * When debugging is enabled (NDEBUG not defined), file, line and function + * information replace the driver name (MLX4_DRIVER_NAME) in log messages. + */ + +/** Return the file name part of a path. */ +static inline const char * +pmd_drv_log_basename(const char *s) +{ + const char *n = s; + + while (*n) + if (*(n++) == '/') + s = n; + return s; +} + +#define PMD_DRV_LOG(level, ...) \ + RTE_LOG(level, PMD, \ + RTE_FMT("%s:%u: %s(): " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + pmd_drv_log_basename(__FILE__), \ + __LINE__, \ + __func__, \ + RTE_FMT_TAIL(__VA_ARGS__,))) +#define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__) +#define claim_zero(...) assert((__VA_ARGS__) == 0) + +#else /* NDEBUG */ + +/* + * Like assert(), DEBUG() becomes a no-op and claim_zero() does not perform + * any check when debugging is disabled. + */ + +#define PMD_DRV_LOG(level, ...) \ + RTE_LOG(level, PMD, \ + RTE_FMT(MLX4_DRIVER_NAME ": " \ + RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + RTE_FMT_TAIL(__VA_ARGS__,))) +#define DEBUG(...) (void)0 +#define claim_zero(...) (__VA_ARGS__) + +#endif /* NDEBUG */ + +#define INFO(...) PMD_DRV_LOG(INFO, __VA_ARGS__) +#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__) +#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__) + +/** Allocate a buffer on the stack and fill it with a printf format string. */ +#define MKSTR(name, ...) \ + char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \ + \ + snprintf(name, sizeof(name), __VA_ARGS__) + +/** Generate a string out of the provided arguments. */ +#define MLX4_STR(...) # __VA_ARGS__ + +/** Similar to MLX4_STR() with enclosed macros expanded first. */ +#define MLX4_STR_EXPAND(...) MLX4_STR(__VA_ARGS__) + +/** Object description used with mlx4_mallocv() and similar functions. */ +struct mlx4_malloc_vec { + size_t align; /**< Alignment constraint (power of 2), 0 if unknown. */ + size_t size; /**< Object size. */ + void **addr; /**< Storage for allocation address. */ +}; + +/* mlx4_utils.c */ + +int mlx4_fd_set_non_blocking(int fd); +size_t mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt); +size_t mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt); +size_t mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket); +size_t mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket); + +#endif /* MLX4_UTILS_H_ */ diff --git a/src/seastar/dpdk/drivers/net/mlx4/rte_pmd_mlx4_version.map b/src/seastar/dpdk/drivers/net/mlx4/rte_pmd_mlx4_version.map new file mode 100644 index 000000000..ef3539840 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/mlx4/rte_pmd_mlx4_version.map @@ -0,0 +1,4 @@ +DPDK_2.0 { + + local: *; +}; |