Diffstat (limited to 'src/spdk/dpdk/drivers/net/mlx5')
34 files changed, 45111 insertions, 0 deletions
diff --git a/src/spdk/dpdk/drivers/net/mlx5/Makefile b/src/spdk/dpdk/drivers/net/mlx5/Makefile new file mode 100644 index 000000000..2577ee5e5 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/Makefile @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2015 6WIND S.A. +# Copyright 2015 Mellanox Technologies, Ltd + +include $(RTE_SDK)/mk/rte.vars.mk + +# Library name. +LIB = librte_pmd_mlx5.a + +# Sources. +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c +ifneq ($(filter y,$(CONFIG_RTE_ARCH_X86_64) \ + $(CONFIG_RTE_ARCH_PPC_64) \ + $(CONFIG_RTE_ARCH_ARM64)),) +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx_vec.c +endif +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_trigger.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_ethdev.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mac.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxmode.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_meter.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_dv.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_verbs.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mp.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_utils.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c + +# Basic CFLAGS. +CFLAGS += -O3 +CFLAGS += -std=c11 -Wall -Wextra +CFLAGS += -g +CFLAGS += -I$(RTE_SDK)/drivers/common/mlx5 +CFLAGS += -I$(RTE_SDK)/drivers/net/mlx5 +CFLAGS += -I$(BUILDDIR)/drivers/common/mlx5 +CFLAGS += -D_BSD_SOURCE +CFLAGS += -D_DEFAULT_SOURCE +CFLAGS += -D_XOPEN_SOURCE=600 +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -Wno-strict-prototypes +LDLIBS += -lrte_common_mlx5 +LDLIBS += -lm +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs +LDLIBS += -lrte_bus_pci + +# A few warnings cannot be avoided in external headers. +CFLAGS += -Wno-error=cast-qual + +EXPORT_MAP := rte_pmd_mlx5_version.map + +# DEBUG which is usually provided on the command-line may enable +# CONFIG_RTE_LIBRTE_MLX5_DEBUG. +ifeq ($(DEBUG),1) +CONFIG_RTE_LIBRTE_MLX5_DEBUG := y +endif + +# User-defined CFLAGS. +ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DEBUG),y) +CFLAGS += -pedantic +ifneq ($(CONFIG_RTE_TOOLCHAIN_ICC),y) +CFLAGS += -DPEDANTIC +endif +AUTO_CONFIG_CFLAGS += -Wno-pedantic +else +CFLAGS += -UPEDANTIC +endif + +include $(RTE_SDK)/mk/rte.lib.mk + diff --git a/src/spdk/dpdk/drivers/net/mlx5/meson.build b/src/spdk/dpdk/drivers/net/mlx5/meson.build new file mode 100644 index 000000000..928663af7 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/meson.build @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2018 6WIND S.A. 
+# Copyright 2018 Mellanox Technologies, Ltd + +if not is_linux + build = false + reason = 'only supported on Linux' + subdir_done() +endif + +deps += ['hash', 'common_mlx5'] +sources = files( + 'mlx5.c', + 'mlx5_ethdev.c', + 'mlx5_flow.c', + 'mlx5_flow_meter.c', + 'mlx5_flow_dv.c', + 'mlx5_flow_verbs.c', + 'mlx5_mac.c', + 'mlx5_mr.c', + 'mlx5_rss.c', + 'mlx5_rxmode.c', + 'mlx5_rxq.c', + 'mlx5_rxtx.c', + 'mlx5_mp.c', + 'mlx5_stats.c', + 'mlx5_trigger.c', + 'mlx5_txq.c', + 'mlx5_vlan.c', + 'mlx5_utils.c', + 'mlx5_socket.c', +) +if (dpdk_conf.has('RTE_ARCH_X86_64') + or dpdk_conf.has('RTE_ARCH_ARM64') + or dpdk_conf.has('RTE_ARCH_PPC_64')) + sources += files('mlx5_rxtx_vec.c') +endif +cflags_options = [ + '-std=c11', + '-Wno-strict-prototypes', + '-D_BSD_SOURCE', + '-D_DEFAULT_SOURCE', + '-D_XOPEN_SOURCE=600' +] +foreach option:cflags_options + if cc.has_argument(option) + cflags += option + endif +endforeach +if get_option('buildtype').contains('debug') + cflags += [ '-pedantic', '-DPEDANTIC' ] +else + cflags += [ '-UPEDANTIC' ] +endif diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5.c new file mode 100644 index 000000000..5589772eb --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5.c @@ -0,0 +1,3814 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <unistd.h> +#include <string.h> +#include <stdint.h> +#include <stdlib.h> +#include <errno.h> +#include <net/if.h> +#include <sys/mman.h> +#include <linux/rtnetlink.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> +#include <rte_ethdev_pci.h> +#include <rte_pci.h> +#include <rte_bus_pci.h> +#include <rte_common.h> +#include <rte_kvargs.h> +#include <rte_rwlock.h> +#include <rte_spinlock.h> +#include <rte_string_fns.h> +#include <rte_alarm.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_common.h> +#include <mlx5_common_mp.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" +#include "mlx5_mr.h" +#include "mlx5_flow.h" +#include "rte_pmd_mlx5.h" + +/* Device parameter to enable RX completion queue compression. */ +#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" + +/* Device parameter to enable RX completion entry padding to 128B. */ +#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en" + +/* Device parameter to enable padding Rx packet to cacheline size. */ +#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en" + +/* Device parameter to enable Multi-Packet Rx queue. */ +#define MLX5_RX_MPRQ_EN "mprq_en" + +/* Device parameter to configure log 2 of the number of strides for MPRQ. */ +#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num" + +/* Device parameter to configure log 2 of the stride size for MPRQ. */ +#define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size" + +/* Device parameter to limit the size of memcpy'd packet for MPRQ. */ +#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len" + +/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */ +#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq" + +/* Device parameter to configure inline send. 
Deprecated, ignored.*/ +#define MLX5_TXQ_INLINE "txq_inline" + +/* Device parameter to limit packet size to inline with ordinary SEND. */ +#define MLX5_TXQ_INLINE_MAX "txq_inline_max" + +/* Device parameter to configure minimal data size to inline. */ +#define MLX5_TXQ_INLINE_MIN "txq_inline_min" + +/* Device parameter to limit packet size to inline with Enhanced MPW. */ +#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw" + +/* + * Device parameter to configure the number of TX queues threshold for + * enabling inline send. + */ +#define MLX5_TXQS_MIN_INLINE "txqs_min_inline" + +/* + * Device parameter to configure the number of TX queues threshold for + * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines). + */ +#define MLX5_TXQS_MAX_VEC "txqs_max_vec" + +/* Device parameter to enable multi-packet send WQEs. */ +#define MLX5_TXQ_MPW_EN "txq_mpw_en" + +/* + * Device parameter to force doorbell register mapping + * to non-cahed region eliminating the extra write memory barrier. + */ +#define MLX5_TX_DB_NC "tx_db_nc" + +/* + * Device parameter to include 2 dsegs in the title WQEBB. + * Deprecated, ignored. + */ +#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" + +/* + * Device parameter to limit the size of inlining packet. + * Deprecated, ignored. + */ +#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" + +/* + * Device parameter to enable hardware Tx vector. + * Deprecated, ignored (no vectorized Tx routines anymore). + */ +#define MLX5_TX_VEC_EN "tx_vec_en" + +/* Device parameter to enable hardware Rx vector. */ +#define MLX5_RX_VEC_EN "rx_vec_en" + +/* Allow L3 VXLAN flow creation. */ +#define MLX5_L3_VXLAN_EN "l3_vxlan_en" + +/* Activate DV E-Switch flow steering. */ +#define MLX5_DV_ESW_EN "dv_esw_en" + +/* Activate DV flow steering. */ +#define MLX5_DV_FLOW_EN "dv_flow_en" + +/* Enable extensive flow metadata support. */ +#define MLX5_DV_XMETA_EN "dv_xmeta_en" + +/* Activate Netlink support in VF mode. */ +#define MLX5_VF_NL_EN "vf_nl_en" + +/* Enable extending memsegs when creating a MR. */ +#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en" + +/* Select port representors to instantiate. */ +#define MLX5_REPRESENTOR "representor" + +/* Device parameter to configure the maximum number of dump files per queue. */ +#define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num" + +/* Configure timeout of LRO session (in microseconds). */ +#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec" + +/* + * Device parameter to configure the total data buffer size for a single + * hairpin queue (logarithm value). + */ +#define MLX5_HP_BUF_SIZE "hp_buf_log_sz" + +#ifndef HAVE_IBV_MLX5_MOD_MPW +#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) +#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) +#endif + +#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP +#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) +#endif + +static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data"; + +/* Shared memory between primary and secondary processes. */ +struct mlx5_shared_data *mlx5_shared_data; + +/* Spinlock for mlx5_shared_data allocation. */ +static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER; + +/* Process local data for secondary processes. */ +static struct mlx5_local_data mlx5_local_data; + +/** Driver-specific log messages type. */ +int mlx5_logtype; + +/** Data associated with devices to spawn. */ +struct mlx5_dev_spawn_data { + uint32_t ifindex; /**< Network interface index. */ + uint32_t max_port; /**< IB device maximal port index. 
*/ + uint32_t ibv_port; /**< IB device physical port index. */ + int pf_bond; /**< bonding device PF index. < 0 - no bonding */ + struct mlx5_switch_info info; /**< Switch information. */ + struct ibv_device *ibv_dev; /**< Associated IB device. */ + struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ + struct rte_pci_device *pci_dev; /**< Backend PCI device. */ +}; + +static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER(); +static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER; + +static struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + { + .size = sizeof(struct mlx5_flow_dv_encap_decap_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_encap_decap_ipool", + }, + { + .size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_push_vlan_ipool", + }, + { + .size = sizeof(struct mlx5_flow_dv_tag_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_tag_ipool", + }, + { + .size = sizeof(struct mlx5_flow_dv_port_id_action_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_port_id_ipool", + }, + { + .size = sizeof(struct mlx5_flow_tbl_data_entry), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_jump_ipool", + }, +#endif + { + .size = sizeof(struct mlx5_flow_meter), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_meter_ipool", + }, + { + .size = sizeof(struct mlx5_flow_mreg_copy_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_mcp_ipool", + }, + { + .size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_hrxq_ipool", + }, + { + .size = sizeof(struct mlx5_flow_handle), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_flow_handle_ipool", + }, + { + .size = sizeof(struct rte_flow), + .trunk_size = 4096, + .need_lock = 1, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "rte_flow_ipool", + }, +}; + + +#define MLX5_FLOW_MIN_ID_POOL_SIZE 512 +#define MLX5_ID_GENERATION_ARRAY_FACTOR 16 + +#define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096 +#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192 + +/** + * Allocate ID pool structure. + * + * @param[in] max_id + * The maximum id can be allocated from the pool. + * + * @return + * Pointer to pool object, NULL value otherwise. 
+ */ +struct mlx5_flow_id_pool * +mlx5_flow_id_pool_alloc(uint32_t max_id) +{ + struct mlx5_flow_id_pool *pool; + void *mem; + + pool = rte_zmalloc("id pool allocation", sizeof(*pool), + RTE_CACHE_LINE_SIZE); + if (!pool) { + DRV_LOG(ERR, "can't allocate id pool"); + rte_errno = ENOMEM; + return NULL; + } + mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t), + RTE_CACHE_LINE_SIZE); + if (!mem) { + DRV_LOG(ERR, "can't allocate mem for id pool"); + rte_errno = ENOMEM; + goto error; + } + pool->free_arr = mem; + pool->curr = pool->free_arr; + pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE; + pool->base_index = 0; + pool->max_id = max_id; + return pool; +error: + rte_free(pool); + return NULL; +} + +/** + * Release ID pool structure. + * + * @param[in] pool + * Pointer to flow id pool object to free. + */ +void +mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool) +{ + rte_free(pool->free_arr); + rte_free(pool); +} + +/** + * Generate ID. + * + * @param[in] pool + * Pointer to flow id pool. + * @param[out] id + * The generated ID. + * + * @return + * 0 on success, error value otherwise. + */ +uint32_t +mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id) +{ + if (pool->curr == pool->free_arr) { + if (pool->base_index == pool->max_id) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "no free id"); + return -rte_errno; + } + *id = ++pool->base_index; + return 0; + } + *id = *(--pool->curr); + return 0; +} + +/** + * Release ID. + * + * @param[in] pool + * Pointer to flow id pool. + * @param[out] id + * The generated ID. + * + * @return + * 0 on success, error value otherwise. + */ +uint32_t +mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id) +{ + uint32_t size; + uint32_t size2; + void *mem; + + if (pool->curr == pool->last) { + size = pool->curr - pool->free_arr; + size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR; + MLX5_ASSERT(size2 > size); + mem = rte_malloc("", size2 * sizeof(uint32_t), 0); + if (!mem) { + DRV_LOG(ERR, "can't allocate mem for id pool"); + rte_errno = ENOMEM; + return -rte_errno; + } + memcpy(mem, pool->free_arr, size * sizeof(uint32_t)); + rte_free(pool->free_arr); + pool->free_arr = mem; + pool->curr = pool->free_arr + size; + pool->last = pool->free_arr + size2; + } + *pool->curr = id; + pool->curr++; + return 0; +} + +/** + * Initialize the shared aging list information per port. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + */ +static void +mlx5_flow_aging_init(struct mlx5_ibv_shared *sh) +{ + uint32_t i; + struct mlx5_age_info *age_info; + + for (i = 0; i < sh->max_port; i++) { + age_info = &sh->port[i].age_info; + age_info->flags = 0; + TAILQ_INIT(&age_info->aged_counters); + rte_spinlock_init(&age_info->aged_sl); + MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER); + } +} + +/** + * Initialize the counters management structure. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object to free + */ +static void +mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh) +{ + int i; + + memset(&sh->cmng, 0, sizeof(sh->cmng)); + TAILQ_INIT(&sh->cmng.flow_counters); + for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) { + TAILQ_INIT(&sh->cmng.ccont[i].pool_list); + rte_spinlock_init(&sh->cmng.ccont[i].resize_sl); + } +} + +/** + * Destroy all the resources allocated for a counter memory management. + * + * @param[in] mng + * Pointer to the memory management structure. 
+ */ +static void +mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng) +{ + uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data; + + LIST_REMOVE(mng, next); + claim_zero(mlx5_devx_cmd_destroy(mng->dm)); + claim_zero(mlx5_glue->devx_umem_dereg(mng->umem)); + rte_free(mem); +} + +/** + * Close and release all the resources of the counters management. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object to free. + */ +static void +mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh) +{ + struct mlx5_counter_stats_mem_mng *mng; + int i; + int j; + int retries = 1024; + + rte_errno = 0; + while (--retries) { + rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh); + if (rte_errno != EINPROGRESS) + break; + rte_pause(); + } + for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) { + struct mlx5_flow_counter_pool *pool; + uint32_t batch = !!(i > 1); + + if (!sh->cmng.ccont[i].pools) + continue; + pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list); + while (pool) { + if (batch && pool->min_dcs) + claim_zero(mlx5_devx_cmd_destroy + (pool->min_dcs)); + for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) { + if (MLX5_POOL_GET_CNT(pool, j)->action) + claim_zero + (mlx5_glue->destroy_flow_action + (MLX5_POOL_GET_CNT + (pool, j)->action)); + if (!batch && MLX5_GET_POOL_CNT_EXT + (pool, j)->dcs) + claim_zero(mlx5_devx_cmd_destroy + (MLX5_GET_POOL_CNT_EXT + (pool, j)->dcs)); + } + TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next); + rte_free(pool); + pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list); + } + rte_free(sh->cmng.ccont[i].pools); + } + mng = LIST_FIRST(&sh->cmng.mem_mngs); + while (mng) { + mlx5_flow_destroy_counter_stat_mem_mng(mng); + mng = LIST_FIRST(&sh->cmng.mem_mngs); + } + memset(&sh->cmng, 0, sizeof(sh->cmng)); +} + +/** + * Initialize the flow resources' indexed mempool. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + * @param[in] sh + * Pointer to user dev config. + */ +static void +mlx5_flow_ipool_create(struct mlx5_ibv_shared *sh, + const struct mlx5_dev_config *config __rte_unused) +{ + uint8_t i; + +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + /* + * While DV is supported, user chooses the verbs mode, + * the mlx5 flow handle size is different with the + * MLX5_FLOW_HANDLE_VERBS_SIZE. + */ + if (!config->dv_flow_en) + mlx5_ipool_cfg[MLX5_IPOOL_MLX5_FLOW].size = + MLX5_FLOW_HANDLE_VERBS_SIZE; +#endif + for (i = 0; i < MLX5_IPOOL_MAX; ++i) + sh->ipool[i] = mlx5_ipool_create(&mlx5_ipool_cfg[i]); +} + +/** + * Release the flow resources' indexed mempool. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + */ +static void +mlx5_flow_ipool_destroy(struct mlx5_ibv_shared *sh) +{ + uint8_t i; + + for (i = 0; i < MLX5_IPOOL_MAX; ++i) + mlx5_ipool_destroy(sh->ipool[i]); +} + +/** + * Extract pdn of PD object using DV API. + * + * @param[in] pd + * Pointer to the verbs PD object. + * @param[out] pdn + * Pointer to the PD object number variable. + * + * @return + * 0 on success, error value otherwise. 
+ */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT +static int +mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused) +{ + struct mlx5dv_obj obj; + struct mlx5dv_pd pd_info; + int ret = 0; + + obj.pd.in = pd; + obj.pd.out = &pd_info; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); + if (ret) { + DRV_LOG(DEBUG, "Fail to get PD object info"); + return ret; + } + *pdn = pd_info.pdn; + return 0; +} +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ + +static int +mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config) +{ + char *env; + int value; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* Get environment variable to store. */ + env = getenv(MLX5_SHUT_UP_BF); + value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; + if (config->dbnc == MLX5_ARG_UNSET) + setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); + else + setenv(MLX5_SHUT_UP_BF, + config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1); + return value; +} + +static void +mlx5_restore_doorbell_mapping_env(int value) +{ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* Restore the original environment variable state. */ + if (value == MLX5_ARG_UNSET) + unsetenv(MLX5_SHUT_UP_BF); + else + setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); +} + +/** + * Allocate shared IB device context. If there is multiport device the + * master and representors will share this context, if there is single + * port dedicated IB device, the context will be used by only given + * port due to unification. + * + * Routine first searches the context for the specified IB device name, + * if found the shared context assumed and reference counter is incremented. + * If no context found the new one is created and initialized with specified + * IB device context and parameters. + * + * @param[in] spawn + * Pointer to the IB device attributes (name, port, etc). + * @param[in] config + * Pointer to device configuration structure. + * + * @return + * Pointer to mlx5_ibv_shared object on success, + * otherwise NULL and rte_errno is set. + */ +static struct mlx5_ibv_shared * +mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn, + const struct mlx5_dev_config *config) +{ + struct mlx5_ibv_shared *sh; + int dbmap_env; + int err = 0; + uint32_t i; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5_devx_tis_attr tis_attr = { 0 }; +#endif + + MLX5_ASSERT(spawn); + /* Secondary process should not create the shared context. */ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + pthread_mutex_lock(&mlx5_ibv_list_mutex); + /* Search for IB context by device name. */ + LIST_FOREACH(sh, &mlx5_ibv_list, next) { + if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) { + sh->refcnt++; + goto exit; + } + } + /* No device found, we have to create new shared context. */ + MLX5_ASSERT(spawn->max_port); + sh = rte_zmalloc("ethdev shared ib context", + sizeof(struct mlx5_ibv_shared) + + spawn->max_port * + sizeof(struct mlx5_ibv_shared_port), + RTE_CACHE_LINE_SIZE); + if (!sh) { + DRV_LOG(ERR, "shared context allocation failure"); + rte_errno = ENOMEM; + goto exit; + } + /* + * Configure environment variable "MLX5_BF_SHUT_UP" + * before the device creation. The rdma_core library + * checks the variable at device creation and + * stores the result internally. + */ + dbmap_env = mlx5_config_doorbell_mapping_env(config); + /* Try to open IB device with DV first, then usual Verbs. 
*/ + errno = 0; + sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev); + if (sh->ctx) { + sh->devx = 1; + DRV_LOG(DEBUG, "DevX is supported"); + /* The device is created, no need for environment. */ + mlx5_restore_doorbell_mapping_env(dbmap_env); + } else { + /* The environment variable is still configured. */ + sh->ctx = mlx5_glue->open_device(spawn->ibv_dev); + err = errno ? errno : ENODEV; + /* + * The environment variable is not needed anymore, + * all device creation attempts are completed. + */ + mlx5_restore_doorbell_mapping_env(dbmap_env); + if (!sh->ctx) + goto error; + DRV_LOG(DEBUG, "DevX is NOT supported"); + } + err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr); + if (err) { + DRV_LOG(DEBUG, "ibv_query_device_ex() failed"); + goto error; + } + sh->refcnt = 1; + sh->max_port = spawn->max_port; + strncpy(sh->ibdev_name, sh->ctx->device->name, + sizeof(sh->ibdev_name)); + strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path, + sizeof(sh->ibdev_path)); + pthread_mutex_init(&sh->intr_mutex, NULL); + /* + * Setting port_id to max unallowed value means + * there is no interrupt subhandler installed for + * the given port index i. + */ + for (i = 0; i < sh->max_port; i++) { + sh->port[i].ih_port_id = RTE_MAX_ETHPORTS; + sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS; + } + sh->pd = mlx5_glue->alloc_pd(sh->ctx); + if (sh->pd == NULL) { + DRV_LOG(ERR, "PD allocation failure"); + err = ENOMEM; + goto error; + } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (sh->devx) { + err = mlx5_get_pdn(sh->pd, &sh->pdn); + if (err) { + DRV_LOG(ERR, "Fail to extract pdn from PD"); + goto error; + } + sh->td = mlx5_devx_cmd_create_td(sh->ctx); + if (!sh->td) { + DRV_LOG(ERR, "TD allocation failure"); + err = ENOMEM; + goto error; + } + tis_attr.transport_domain = sh->td->id; + sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr); + if (!sh->tis) { + DRV_LOG(ERR, "TIS allocation failure"); + err = ENOMEM; + goto error; + } + } + sh->flow_id_pool = mlx5_flow_id_pool_alloc + ((1 << HAIRPIN_FLOW_ID_BITS) - 1); + if (!sh->flow_id_pool) { + DRV_LOG(ERR, "can't create flow id pool"); + err = ENOMEM; + goto error; + } +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + * + * At this point the device is not added to the memory + * event list yet, context is just being created. + */ + err = mlx5_mr_btree_init(&sh->share_cache.cache, + MLX5_MR_BTREE_CACHE_N * 2, + spawn->pci_dev->device.numa_node); + if (err) { + err = rte_errno; + goto error; + } + mlx5_flow_aging_init(sh); + mlx5_flow_counters_mng_init(sh); + mlx5_flow_ipool_create(sh, config); + /* Add device to memory callback list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, + sh, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + /* Add context to the global device list. 
*/ + LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next); +exit: + pthread_mutex_unlock(&mlx5_ibv_list_mutex); + return sh; +error: + pthread_mutex_unlock(&mlx5_ibv_list_mutex); + MLX5_ASSERT(sh); + if (sh->tis) + claim_zero(mlx5_devx_cmd_destroy(sh->tis)); + if (sh->td) + claim_zero(mlx5_devx_cmd_destroy(sh->td)); + if (sh->pd) + claim_zero(mlx5_glue->dealloc_pd(sh->pd)); + if (sh->ctx) + claim_zero(mlx5_glue->close_device(sh->ctx)); + if (sh->flow_id_pool) + mlx5_flow_id_pool_release(sh->flow_id_pool); + rte_free(sh); + MLX5_ASSERT(err > 0); + rte_errno = err; + return NULL; +} + +/** + * Free shared IB device context. Decrement counter and if zero free + * all allocated resources and close handles. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object to free + */ +static void +mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh) +{ + pthread_mutex_lock(&mlx5_ibv_list_mutex); +#ifdef RTE_LIBRTE_MLX5_DEBUG + /* Check the object presence in the list. */ + struct mlx5_ibv_shared *lctx; + + LIST_FOREACH(lctx, &mlx5_ibv_list, next) + if (lctx == sh) + break; + MLX5_ASSERT(lctx); + if (lctx != sh) { + DRV_LOG(ERR, "Freeing non-existing shared IB context"); + goto exit; + } +#endif + MLX5_ASSERT(sh); + MLX5_ASSERT(sh->refcnt); + /* Secondary process should not free the shared context. */ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (--sh->refcnt) + goto exit; + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_REMOVE(sh, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + /* Release created Memory Regions. */ + mlx5_mr_release_cache(&sh->share_cache); + /* Remove context from the global device list. */ + LIST_REMOVE(sh, next); + /* + * Ensure there is no async event handler installed. + * Only primary process handles async device events. + **/ + mlx5_flow_counters_mng_close(sh); + mlx5_flow_ipool_destroy(sh); + MLX5_ASSERT(!sh->intr_cnt); + if (sh->intr_cnt) + mlx5_intr_callback_unregister + (&sh->intr_handle, mlx5_dev_interrupt_handler, sh); +#ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT + if (sh->devx_intr_cnt) { + if (sh->intr_handle_devx.fd) + rte_intr_callback_unregister(&sh->intr_handle_devx, + mlx5_dev_interrupt_handler_devx, sh); + if (sh->devx_comp) + mlx5dv_devx_destroy_cmd_comp(sh->devx_comp); + } +#endif + pthread_mutex_destroy(&sh->intr_mutex); + if (sh->pd) + claim_zero(mlx5_glue->dealloc_pd(sh->pd)); + if (sh->tis) + claim_zero(mlx5_devx_cmd_destroy(sh->tis)); + if (sh->td) + claim_zero(mlx5_devx_cmd_destroy(sh->td)); + if (sh->ctx) + claim_zero(mlx5_glue->close_device(sh->ctx)); + if (sh->flow_id_pool) + mlx5_flow_id_pool_release(sh->flow_id_pool); + rte_free(sh); +exit: + pthread_mutex_unlock(&mlx5_ibv_list_mutex); +} + +/** + * Destroy table hash list and all the root entries per domain. + * + * @param[in] priv + * Pointer to the private device data structure. 
+ */ +static void +mlx5_free_table_hash_list(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_tbl_data_entry *tbl_data; + union mlx5_flow_tbl_key table_key = { + { + .table_id = 0, + .reserved = 0, + .domain = 0, + .direction = 0, + } + }; + struct mlx5_hlist_entry *pos; + + if (!sh->flow_tbls) + return; + pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64); + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + MLX5_ASSERT(tbl_data); + mlx5_hlist_remove(sh->flow_tbls, pos); + rte_free(tbl_data); + } + table_key.direction = 1; + pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64); + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + MLX5_ASSERT(tbl_data); + mlx5_hlist_remove(sh->flow_tbls, pos); + rte_free(tbl_data); + } + table_key.direction = 0; + table_key.domain = 1; + pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64); + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + MLX5_ASSERT(tbl_data); + mlx5_hlist_remove(sh->flow_tbls, pos); + rte_free(tbl_data); + } + mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL); +} + +/** + * Initialize flow table hash list and create the root tables entry + * for each domain. + * + * @param[in] priv + * Pointer to the private device data structure. + * + * @return + * Zero on success, positive error code otherwise. + */ +static int +mlx5_alloc_table_hash_list(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh = priv->sh; + char s[MLX5_HLIST_NAMESIZE]; + int err = 0; + + MLX5_ASSERT(sh); + snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name); + sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE); + if (!sh->flow_tbls) { + DRV_LOG(ERR, "flow tables with hash creation failed.\n"); + err = ENOMEM; + return err; + } +#ifndef HAVE_MLX5DV_DR + /* + * In case we have not DR support, the zero tables should be created + * because DV expect to see them even if they cannot be created by + * RDMA-CORE. + */ + union mlx5_flow_tbl_key table_key = { + { + .table_id = 0, + .reserved = 0, + .domain = 0, + .direction = 0, + } + }; + struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL, + sizeof(*tbl_data), 0); + + if (!tbl_data) { + err = ENOMEM; + goto error; + } + tbl_data->entry.key = table_key.v64; + err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry); + if (err) + goto error; + rte_atomic32_init(&tbl_data->tbl.refcnt); + rte_atomic32_inc(&tbl_data->tbl.refcnt); + table_key.direction = 1; + tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0); + if (!tbl_data) { + err = ENOMEM; + goto error; + } + tbl_data->entry.key = table_key.v64; + err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry); + if (err) + goto error; + rte_atomic32_init(&tbl_data->tbl.refcnt); + rte_atomic32_inc(&tbl_data->tbl.refcnt); + table_key.direction = 0; + table_key.domain = 1; + tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0); + if (!tbl_data) { + err = ENOMEM; + goto error; + } + tbl_data->entry.key = table_key.v64; + err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry); + if (err) + goto error; + rte_atomic32_init(&tbl_data->tbl.refcnt); + rte_atomic32_inc(&tbl_data->tbl.refcnt); + return err; +error: + mlx5_free_table_hash_list(priv); +#endif /* HAVE_MLX5DV_DR */ + return err; +} + +/** + * Initialize DR related data within private structure. + * Routine checks the reference counter and does actual + * resources creation/initialization only if counter is zero. 
+ * + * @param[in] priv + * Pointer to the private device data structure. + * + * @return + * Zero on success, positive error code otherwise. + */ +static int +mlx5_alloc_shared_dr(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh = priv->sh; + char s[MLX5_HLIST_NAMESIZE]; + int err = 0; + + if (!sh->flow_tbls) + err = mlx5_alloc_table_hash_list(priv); + else + DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n", + (void *)sh->flow_tbls); + if (err) + return err; + /* Create tags hash list table. */ + snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name); + sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE); + if (!sh->tag_table) { + DRV_LOG(ERR, "tags with hash creation failed.\n"); + err = ENOMEM; + goto error; + } +#ifdef HAVE_MLX5DV_DR + void *domain; + + if (sh->dv_refcnt) { + /* Shared DV/DR structures is already initialized. */ + sh->dv_refcnt++; + priv->dr_shared = 1; + return 0; + } + /* Reference counter is zero, we should initialize structures. */ + domain = mlx5_glue->dr_create_domain(sh->ctx, + MLX5DV_DR_DOMAIN_TYPE_NIC_RX); + if (!domain) { + DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed"); + err = errno; + goto error; + } + sh->rx_domain = domain; + domain = mlx5_glue->dr_create_domain(sh->ctx, + MLX5DV_DR_DOMAIN_TYPE_NIC_TX); + if (!domain) { + DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed"); + err = errno; + goto error; + } + pthread_mutex_init(&sh->dv_mutex, NULL); + sh->tx_domain = domain; +#ifdef HAVE_MLX5DV_DR_ESWITCH + if (priv->config.dv_esw_en) { + domain = mlx5_glue->dr_create_domain + (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB); + if (!domain) { + DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed"); + err = errno; + goto error; + } + sh->fdb_domain = domain; + sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop(); + } +#endif + sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan(); +#endif /* HAVE_MLX5DV_DR */ + sh->dv_refcnt++; + priv->dr_shared = 1; + return 0; +error: + /* Rollback the created objects. */ + if (sh->rx_domain) { + mlx5_glue->dr_destroy_domain(sh->rx_domain); + sh->rx_domain = NULL; + } + if (sh->tx_domain) { + mlx5_glue->dr_destroy_domain(sh->tx_domain); + sh->tx_domain = NULL; + } + if (sh->fdb_domain) { + mlx5_glue->dr_destroy_domain(sh->fdb_domain); + sh->fdb_domain = NULL; + } + if (sh->esw_drop_action) { + mlx5_glue->destroy_flow_action(sh->esw_drop_action); + sh->esw_drop_action = NULL; + } + if (sh->pop_vlan_action) { + mlx5_glue->destroy_flow_action(sh->pop_vlan_action); + sh->pop_vlan_action = NULL; + } + if (sh->tag_table) { + /* tags should be destroyed with flow before. */ + mlx5_hlist_destroy(sh->tag_table, NULL, NULL); + sh->tag_table = NULL; + } + mlx5_free_table_hash_list(priv); + return err; +} + +/** + * Destroy DR related data within private structure. + * + * @param[in] priv + * Pointer to the private device data structure. 
+ */ +static void +mlx5_free_shared_dr(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh; + + if (!priv->dr_shared) + return; + priv->dr_shared = 0; + sh = priv->sh; + MLX5_ASSERT(sh); +#ifdef HAVE_MLX5DV_DR + MLX5_ASSERT(sh->dv_refcnt); + if (sh->dv_refcnt && --sh->dv_refcnt) + return; + if (sh->rx_domain) { + mlx5_glue->dr_destroy_domain(sh->rx_domain); + sh->rx_domain = NULL; + } + if (sh->tx_domain) { + mlx5_glue->dr_destroy_domain(sh->tx_domain); + sh->tx_domain = NULL; + } +#ifdef HAVE_MLX5DV_DR_ESWITCH + if (sh->fdb_domain) { + mlx5_glue->dr_destroy_domain(sh->fdb_domain); + sh->fdb_domain = NULL; + } + if (sh->esw_drop_action) { + mlx5_glue->destroy_flow_action(sh->esw_drop_action); + sh->esw_drop_action = NULL; + } +#endif + if (sh->pop_vlan_action) { + mlx5_glue->destroy_flow_action(sh->pop_vlan_action); + sh->pop_vlan_action = NULL; + } + pthread_mutex_destroy(&sh->dv_mutex); +#endif /* HAVE_MLX5DV_DR */ + if (sh->tag_table) { + /* tags should be destroyed with flow before. */ + mlx5_hlist_destroy(sh->tag_table, NULL, NULL); + sh->tag_table = NULL; + } + mlx5_free_table_hash_list(priv); +} + +/** + * Initialize shared data between primary and secondary process. + * + * A memzone is reserved by primary process and secondary processes attach to + * the memzone. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_init_shared_data(void) +{ + const struct rte_memzone *mz; + int ret = 0; + + rte_spinlock_lock(&mlx5_shared_data_lock); + if (mlx5_shared_data == NULL) { + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* Allocate shared memory. */ + mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, + sizeof(*mlx5_shared_data), + SOCKET_ID_ANY, 0); + if (mz == NULL) { + DRV_LOG(ERR, + "Cannot allocate mlx5 shared data"); + ret = -rte_errno; + goto error; + } + mlx5_shared_data = mz->addr; + memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data)); + rte_spinlock_init(&mlx5_shared_data->lock); + } else { + /* Lookup allocated shared memory. */ + mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA); + if (mz == NULL) { + DRV_LOG(ERR, + "Cannot attach mlx5 shared data"); + ret = -rte_errno; + goto error; + } + mlx5_shared_data = mz->addr; + memset(&mlx5_local_data, 0, sizeof(mlx5_local_data)); + } + } +error: + rte_spinlock_unlock(&mlx5_shared_data_lock); + return ret; +} + +/** + * Retrieve integer value from environment variable. + * + * @param[in] name + * Environment variable name. + * + * @return + * Integer value, 0 if the variable is not set. + */ +int +mlx5_getenv_int(const char *name) +{ + const char *val = getenv(name); + + if (val == NULL) + return 0; + return atoi(val); +} + +/** + * Verbs callback to allocate a memory. This function should allocate the space + * according to the size provided residing inside a huge page. + * Please note that all allocation must respect the alignment from libmlx5 + * (i.e. currently sysconf(_SC_PAGESIZE)). + * + * @param[in] size + * The size in bytes of the memory to allocate. + * @param[in] data + * A pointer to the callback data. + * + * @return + * Allocated buffer, NULL otherwise and rte_errno is set. 
+ */ +static void * +mlx5_alloc_verbs_buf(size_t size, void *data) +{ + struct mlx5_priv *priv = data; + void *ret; + size_t alignment = sysconf(_SC_PAGESIZE); + unsigned int socket = SOCKET_ID_ANY; + + if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { + const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; + + socket = ctrl->socket; + } else if (priv->verbs_alloc_ctx.type == + MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { + const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; + + socket = ctrl->socket; + } + MLX5_ASSERT(data != NULL); + ret = rte_malloc_socket(__func__, size, alignment, socket); + if (!ret && size) + rte_errno = ENOMEM; + return ret; +} + +/** + * Verbs callback to free a memory. + * + * @param[in] ptr + * A pointer to the memory to free. + * @param[in] data + * A pointer to the callback data. + */ +static void +mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) +{ + MLX5_ASSERT(data != NULL); + rte_free(ptr); +} + +/** + * DPDK callback to add udp tunnel port + * + * @param[in] dev + * A pointer to eth_dev + * @param[in] udp_tunnel + * A pointer to udp tunnel + * + * @return + * 0 on valid udp ports and tunnels, -ENOTSUP otherwise. + */ +int +mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_udp_tunnel *udp_tunnel) +{ + MLX5_ASSERT(udp_tunnel != NULL); + if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN && + udp_tunnel->udp_port == 4789) + return 0; + if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE && + udp_tunnel->udp_port == 4790) + return 0; + return -ENOTSUP; +} + +/** + * Initialize process private data structure. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_proc_priv_init(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_proc_priv *ppriv; + size_t ppriv_size; + + /* + * UAR register table follows the process private structure. BlueFlame + * registers for Tx queues are stored in the table. + */ + ppriv_size = + sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *); + ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size, + RTE_CACHE_LINE_SIZE, dev->device->numa_node); + if (!ppriv) { + rte_errno = ENOMEM; + return -rte_errno; + } + ppriv->uar_table_sz = ppriv_size; + dev->process_private = ppriv; + return 0; +} + +/** + * Un-initialize process private data structure. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx5_proc_priv_uninit(struct rte_eth_dev *dev) +{ + if (!dev->process_private) + return; + rte_free(dev->process_private); + dev->process_private = NULL; +} + +/** + * DPDK callback to close the device. + * + * Destroy all queues and objects, free memory. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx5_dev_close(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + int ret; + + DRV_LOG(DEBUG, "port %u closing device \"%s\"", + dev->data->port_id, + ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : "")); + /* In case mlx5_dev_stop() has not been called. */ + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_dev_interrupt_handler_devx_uninstall(dev); + /* + * If default mreg copy action is removed at the stop stage, + * the search will return none and nothing will be done anymore. 
+ */ + mlx5_flow_stop_default(dev); + mlx5_traffic_disable(dev); + /* + * If all the flows are already flushed in the device stop stage, + * then this will return directly without any action. + */ + mlx5_flow_list_flush(dev, &priv->flows, true); + mlx5_flow_meter_flush(dev, NULL); + /* Free the intermediate buffers for flow creation. */ + mlx5_flow_free_intermediate(dev); + /* Prevent crashes when queues are still in use. */ + dev->rx_pkt_burst = removed_rx_burst; + dev->tx_pkt_burst = removed_tx_burst; + rte_wmb(); + /* Disable datapath on secondary process. */ + mlx5_mp_req_stop_rxtx(dev); + if (priv->rxqs != NULL) { + /* XXX race condition if mlx5_rx_burst() is still running. */ + usleep(1000); + for (i = 0; (i != priv->rxqs_n); ++i) + mlx5_rxq_release(dev, i); + priv->rxqs_n = 0; + priv->rxqs = NULL; + } + if (priv->txqs != NULL) { + /* XXX race condition if mlx5_tx_burst() is still running. */ + usleep(1000); + for (i = 0; (i != priv->txqs_n); ++i) + mlx5_txq_release(dev, i); + priv->txqs_n = 0; + priv->txqs = NULL; + } + mlx5_proc_priv_uninit(dev); + if (priv->mreg_cp_tbl) + mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL); + mlx5_mprq_free_mp(dev); + mlx5_free_shared_dr(priv); + if (priv->rss_conf.rss_key != NULL) + rte_free(priv->rss_conf.rss_key); + if (priv->reta_idx != NULL) + rte_free(priv->reta_idx); + if (priv->config.vf) + mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev), + dev->data->mac_addrs, + MLX5_MAX_MAC_ADDRESSES, priv->mac_own); + if (priv->nl_socket_route >= 0) + close(priv->nl_socket_route); + if (priv->nl_socket_rdma >= 0) + close(priv->nl_socket_rdma); + if (priv->vmwa_context) + mlx5_vlan_vmwa_exit(priv->vmwa_context); + ret = mlx5_hrxq_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some hash Rx queue still remain", + dev->data->port_id); + ret = mlx5_ind_table_obj_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some indirection table still remain", + dev->data->port_id); + ret = mlx5_rxq_obj_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Rx queue objects still remain", + dev->data->port_id); + ret = mlx5_rxq_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Rx queues still remain", + dev->data->port_id); + ret = mlx5_txq_obj_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain", + dev->data->port_id); + ret = mlx5_txq_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Tx queues still remain", + dev->data->port_id); + ret = mlx5_flow_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some flows still remain", + dev->data->port_id); + if (priv->sh) { + /* + * Free the shared context in last turn, because the cleanup + * routines above may use some shared fields, like + * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieveing + * ifindex if Netlink fails. + */ + mlx5_free_shared_ibctx(priv->sh); + priv->sh = NULL; + } + if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { + unsigned int c = 0; + uint16_t port_id; + + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (!opriv || + opriv->domain_id != priv->domain_id || + &rte_eth_devices[port_id] == dev) + continue; + ++c; + break; + } + if (!c) + claim_zero(rte_eth_switch_domain_free(priv->domain_id)); + } + memset(priv, 0, sizeof(*priv)); + priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + /* + * Reset mac_addrs to NULL such that it is not freed as part of + * rte_eth_dev_release_port(). 
mac_addrs is part of dev_private so + * it is freed when dev_private is freed. + */ + dev->data->mac_addrs = NULL; +} + +const struct eth_dev_ops mlx5_dev_ops = { + .dev_configure = mlx5_dev_configure, + .dev_start = mlx5_dev_start, + .dev_stop = mlx5_dev_stop, + .dev_set_link_down = mlx5_set_link_down, + .dev_set_link_up = mlx5_set_link_up, + .dev_close = mlx5_dev_close, + .promiscuous_enable = mlx5_promiscuous_enable, + .promiscuous_disable = mlx5_promiscuous_disable, + .allmulticast_enable = mlx5_allmulticast_enable, + .allmulticast_disable = mlx5_allmulticast_disable, + .link_update = mlx5_link_update, + .stats_get = mlx5_stats_get, + .stats_reset = mlx5_stats_reset, + .xstats_get = mlx5_xstats_get, + .xstats_reset = mlx5_xstats_reset, + .xstats_get_names = mlx5_xstats_get_names, + .fw_version_get = mlx5_fw_version_get, + .dev_infos_get = mlx5_dev_infos_get, + .read_clock = mlx5_read_clock, + .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, + .vlan_filter_set = mlx5_vlan_filter_set, + .rx_queue_setup = mlx5_rx_queue_setup, + .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup, + .tx_queue_setup = mlx5_tx_queue_setup, + .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup, + .rx_queue_release = mlx5_rx_queue_release, + .tx_queue_release = mlx5_tx_queue_release, + .flow_ctrl_get = mlx5_dev_get_flow_ctrl, + .flow_ctrl_set = mlx5_dev_set_flow_ctrl, + .mac_addr_remove = mlx5_mac_addr_remove, + .mac_addr_add = mlx5_mac_addr_add, + .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, + .mtu_set = mlx5_dev_set_mtu, + .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, + .vlan_offload_set = mlx5_vlan_offload_set, + .reta_update = mlx5_dev_rss_reta_update, + .reta_query = mlx5_dev_rss_reta_query, + .rss_hash_update = mlx5_rss_hash_update, + .rss_hash_conf_get = mlx5_rss_hash_conf_get, + .filter_ctrl = mlx5_dev_filter_ctrl, + .rx_descriptor_status = mlx5_rx_descriptor_status, + .tx_descriptor_status = mlx5_tx_descriptor_status, + .rxq_info_get = mlx5_rxq_info_get, + .txq_info_get = mlx5_txq_info_get, + .rx_burst_mode_get = mlx5_rx_burst_mode_get, + .tx_burst_mode_get = mlx5_tx_burst_mode_get, + .rx_queue_count = mlx5_rx_queue_count, + .rx_queue_intr_enable = mlx5_rx_intr_enable, + .rx_queue_intr_disable = mlx5_rx_intr_disable, + .is_removed = mlx5_is_removed, + .udp_tunnel_port_add = mlx5_udp_tunnel_port_add, + .get_module_info = mlx5_get_module_info, + .get_module_eeprom = mlx5_get_module_eeprom, + .hairpin_cap_get = mlx5_hairpin_cap_get, + .mtr_ops_get = mlx5_flow_meter_ops_get, +}; + +/* Available operations from secondary process. */ +static const struct eth_dev_ops mlx5_dev_sec_ops = { + .stats_get = mlx5_stats_get, + .stats_reset = mlx5_stats_reset, + .xstats_get = mlx5_xstats_get, + .xstats_reset = mlx5_xstats_reset, + .xstats_get_names = mlx5_xstats_get_names, + .fw_version_get = mlx5_fw_version_get, + .dev_infos_get = mlx5_dev_infos_get, + .rx_descriptor_status = mlx5_rx_descriptor_status, + .tx_descriptor_status = mlx5_tx_descriptor_status, + .rxq_info_get = mlx5_rxq_info_get, + .txq_info_get = mlx5_txq_info_get, + .rx_burst_mode_get = mlx5_rx_burst_mode_get, + .tx_burst_mode_get = mlx5_tx_burst_mode_get, + .get_module_info = mlx5_get_module_info, + .get_module_eeprom = mlx5_get_module_eeprom, +}; + +/* Available operations in flow isolated mode. 
*/ +const struct eth_dev_ops mlx5_dev_ops_isolate = { + .dev_configure = mlx5_dev_configure, + .dev_start = mlx5_dev_start, + .dev_stop = mlx5_dev_stop, + .dev_set_link_down = mlx5_set_link_down, + .dev_set_link_up = mlx5_set_link_up, + .dev_close = mlx5_dev_close, + .promiscuous_enable = mlx5_promiscuous_enable, + .promiscuous_disable = mlx5_promiscuous_disable, + .allmulticast_enable = mlx5_allmulticast_enable, + .allmulticast_disable = mlx5_allmulticast_disable, + .link_update = mlx5_link_update, + .stats_get = mlx5_stats_get, + .stats_reset = mlx5_stats_reset, + .xstats_get = mlx5_xstats_get, + .xstats_reset = mlx5_xstats_reset, + .xstats_get_names = mlx5_xstats_get_names, + .fw_version_get = mlx5_fw_version_get, + .dev_infos_get = mlx5_dev_infos_get, + .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, + .vlan_filter_set = mlx5_vlan_filter_set, + .rx_queue_setup = mlx5_rx_queue_setup, + .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup, + .tx_queue_setup = mlx5_tx_queue_setup, + .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup, + .rx_queue_release = mlx5_rx_queue_release, + .tx_queue_release = mlx5_tx_queue_release, + .flow_ctrl_get = mlx5_dev_get_flow_ctrl, + .flow_ctrl_set = mlx5_dev_set_flow_ctrl, + .mac_addr_remove = mlx5_mac_addr_remove, + .mac_addr_add = mlx5_mac_addr_add, + .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, + .mtu_set = mlx5_dev_set_mtu, + .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, + .vlan_offload_set = mlx5_vlan_offload_set, + .filter_ctrl = mlx5_dev_filter_ctrl, + .rx_descriptor_status = mlx5_rx_descriptor_status, + .tx_descriptor_status = mlx5_tx_descriptor_status, + .rxq_info_get = mlx5_rxq_info_get, + .txq_info_get = mlx5_txq_info_get, + .rx_burst_mode_get = mlx5_rx_burst_mode_get, + .tx_burst_mode_get = mlx5_tx_burst_mode_get, + .rx_queue_intr_enable = mlx5_rx_intr_enable, + .rx_queue_intr_disable = mlx5_rx_intr_disable, + .is_removed = mlx5_is_removed, + .get_module_info = mlx5_get_module_info, + .get_module_eeprom = mlx5_get_module_eeprom, + .hairpin_cap_get = mlx5_hairpin_cap_get, + .mtr_ops_get = mlx5_flow_meter_ops_get, +}; + +/** + * Verify and store value for device argument. + * + * @param[in] key + * Key argument to verify. + * @param[in] val + * Value associated with key. + * @param opaque + * User data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_args_check(const char *key, const char *val, void *opaque) +{ + struct mlx5_dev_config *config = opaque; + unsigned long tmp; + + /* No-op, port representors are processed in mlx5_dev_spawn(). 
*/ + if (!strcmp(MLX5_REPRESENTOR, key)) + return 0; + errno = 0; + tmp = strtoul(val, NULL, 0); + if (errno) { + rte_errno = errno; + DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val); + return -rte_errno; + } + if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { + config->cqe_comp = !!tmp; + } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) { + config->cqe_pad = !!tmp; + } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) { + config->hw_padding = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) { + config->mprq.enabled = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { + config->mprq.stride_num_n = tmp; + } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) { + config->mprq.stride_size_n = tmp; + } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) { + config->mprq.max_memcpy_len = tmp; + } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) { + config->mprq.min_rxqs_num = tmp; + } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter," + " converted to txq_inline_max", key); + config->txq_inline_max = tmp; + } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) { + config->txq_inline_max = tmp; + } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) { + config->txq_inline_min = tmp; + } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) { + config->txq_inline_mpw = tmp; + } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { + config->txqs_inline = tmp; + } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); + } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { + config->mps = !!tmp; + } else if (strcmp(MLX5_TX_DB_NC, key) == 0) { + if (tmp != MLX5_TXDB_CACHED && + tmp != MLX5_TXDB_NCACHED && + tmp != MLX5_TXDB_HEURISTIC) { + DRV_LOG(ERR, "invalid Tx doorbell " + "mapping parameter"); + rte_errno = EINVAL; + return -rte_errno; + } + config->dbnc = tmp; + } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); + } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter," + " converted to txq_inline_mpw", key); + config->txq_inline_mpw = tmp; + } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); + } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { + config->rx_vec_en = !!tmp; + } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) { + config->l3_vxlan_en = !!tmp; + } else if (strcmp(MLX5_VF_NL_EN, key) == 0) { + config->vf_nl_en = !!tmp; + } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) { + config->dv_esw_en = !!tmp; + } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) { + config->dv_flow_en = !!tmp; + } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) { + if (tmp != MLX5_XMETA_MODE_LEGACY && + tmp != MLX5_XMETA_MODE_META16 && + tmp != MLX5_XMETA_MODE_META32) { + DRV_LOG(ERR, "invalid extensive " + "metadata parameter"); + rte_errno = EINVAL; + return -rte_errno; + } + config->dv_xmeta_en = tmp; + } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) { + config->mr_ext_memseg_en = !!tmp; + } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) { + config->max_dump_files_num = tmp; + } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) { + config->lro.timeout = tmp; + } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) { + DRV_LOG(DEBUG, "class argument is %s.", val); + } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) { + config->log_hp_size = tmp; + } else { + DRV_LOG(WARNING, "%s: unknown parameter", key); + rte_errno = EINVAL; + return 
-rte_errno; + } + return 0; +} + +/** + * Parse device parameters. + * + * @param config + * Pointer to device configuration structure. + * @param devargs + * Device arguments structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) +{ + const char **params = (const char *[]){ + MLX5_RXQ_CQE_COMP_EN, + MLX5_RXQ_CQE_PAD_EN, + MLX5_RXQ_PKT_PAD_EN, + MLX5_RX_MPRQ_EN, + MLX5_RX_MPRQ_LOG_STRIDE_NUM, + MLX5_RX_MPRQ_LOG_STRIDE_SIZE, + MLX5_RX_MPRQ_MAX_MEMCPY_LEN, + MLX5_RXQS_MIN_MPRQ, + MLX5_TXQ_INLINE, + MLX5_TXQ_INLINE_MIN, + MLX5_TXQ_INLINE_MAX, + MLX5_TXQ_INLINE_MPW, + MLX5_TXQS_MIN_INLINE, + MLX5_TXQS_MAX_VEC, + MLX5_TXQ_MPW_EN, + MLX5_TXQ_MPW_HDR_DSEG_EN, + MLX5_TXQ_MAX_INLINE_LEN, + MLX5_TX_DB_NC, + MLX5_TX_VEC_EN, + MLX5_RX_VEC_EN, + MLX5_L3_VXLAN_EN, + MLX5_VF_NL_EN, + MLX5_DV_ESW_EN, + MLX5_DV_FLOW_EN, + MLX5_DV_XMETA_EN, + MLX5_MR_EXT_MEMSEG_EN, + MLX5_REPRESENTOR, + MLX5_MAX_DUMP_FILES_NUM, + MLX5_LRO_TIMEOUT_USEC, + MLX5_CLASS_ARG_NAME, + MLX5_HP_BUF_SIZE, + NULL, + }; + struct rte_kvargs *kvlist; + int ret = 0; + int i; + + if (devargs == NULL) + return 0; + /* Following UGLY cast is done to pass checkpatch. */ + kvlist = rte_kvargs_parse(devargs->args, params); + if (kvlist == NULL) { + rte_errno = EINVAL; + return -rte_errno; + } + /* Process parameters. */ + for (i = 0; (params[i] != NULL); ++i) { + if (rte_kvargs_count(kvlist, params[i])) { + ret = rte_kvargs_process(kvlist, params[i], + mlx5_args_check, config); + if (ret) { + rte_errno = EINVAL; + rte_kvargs_free(kvlist); + return -rte_errno; + } + } + } + rte_kvargs_free(kvlist); + return 0; +} + +static struct rte_pci_driver mlx5_driver; + +/** + * PMD global initialization. + * + * Independent from individual device, this function initializes global + * per-PMD data structures distinguishing primary and secondary processes. + * Hence, each initialization is called once per a process. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_init_once(void) +{ + struct mlx5_shared_data *sd; + struct mlx5_local_data *ld = &mlx5_local_data; + int ret = 0; + + if (mlx5_init_shared_data()) + return -rte_errno; + sd = mlx5_shared_data; + MLX5_ASSERT(sd); + rte_spinlock_lock(&sd->lock); + switch (rte_eal_process_type()) { + case RTE_PROC_PRIMARY: + if (sd->init_done) + break; + LIST_INIT(&sd->mem_event_cb_list); + rte_rwlock_init(&sd->mem_event_rwlock); + rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", + mlx5_mr_mem_event_cb, NULL); + ret = mlx5_mp_init_primary(MLX5_MP_NAME, + mlx5_mp_primary_handle); + if (ret) + goto out; + sd->init_done = true; + break; + case RTE_PROC_SECONDARY: + if (ld->init_done) + break; + ret = mlx5_mp_init_secondary(MLX5_MP_NAME, + mlx5_mp_secondary_handle); + if (ret) + goto out; + ++sd->secondary_cnt; + ld->init_done = true; + break; + default: + break; + } +out: + rte_spinlock_unlock(&sd->lock); + return ret; +} + +/** + * Configures the minimal amount of data to inline into WQE + * while sending packets. + * + * - the txq_inline_min has the maximal priority, if this + * key is specified in devargs + * - if DevX is enabled the inline mode is queried from the + * device (HCA attributes and NIC vport context if needed). + * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx + * and none (0 bytes) for other NICs + * + * @param spawn + * Verbs device parameters (name, port, switch_info) to spawn. 
+ * @param config + * Device configuration parameters. + */ +static void +mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn, + struct mlx5_dev_config *config) +{ + if (config->txq_inline_min != MLX5_ARG_UNSET) { + /* Application defines size of inlined data explicitly. */ + switch (spawn->pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: + if (config->txq_inline_min < + (int)MLX5_INLINE_HSIZE_L2) { + DRV_LOG(DEBUG, + "txq_inline_min aligned to minimal" + " ConnectX-4 required value %d", + (int)MLX5_INLINE_HSIZE_L2); + config->txq_inline_min = MLX5_INLINE_HSIZE_L2; + } + break; + } + goto exit; + } + if (config->hca_attr.eth_net_offloads) { + /* We have DevX enabled, inline mode queried successfully. */ + switch (config->hca_attr.wqe_inline_mode) { + case MLX5_CAP_INLINE_MODE_L2: + /* outer L2 header must be inlined. */ + config->txq_inline_min = MLX5_INLINE_HSIZE_L2; + goto exit; + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + /* No inline data are required by NIC. */ + config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; + config->hw_vlan_insert = + config->hca_attr.wqe_vlan_insert; + DRV_LOG(DEBUG, "Tx VLAN insertion is supported"); + goto exit; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + /* inline mode is defined by NIC vport context. */ + if (!config->hca_attr.eth_virt) + break; + switch (config->hca_attr.vport_inline_mode) { + case MLX5_INLINE_MODE_NONE: + config->txq_inline_min = + MLX5_INLINE_HSIZE_NONE; + goto exit; + case MLX5_INLINE_MODE_L2: + config->txq_inline_min = + MLX5_INLINE_HSIZE_L2; + goto exit; + case MLX5_INLINE_MODE_IP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_L3; + goto exit; + case MLX5_INLINE_MODE_TCP_UDP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_L4; + goto exit; + case MLX5_INLINE_MODE_INNER_L2: + config->txq_inline_min = + MLX5_INLINE_HSIZE_INNER_L2; + goto exit; + case MLX5_INLINE_MODE_INNER_IP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_INNER_L3; + goto exit; + case MLX5_INLINE_MODE_INNER_TCP_UDP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_INNER_L4; + goto exit; + } + } + } + /* + * We get here if we are unable to deduce + * inline data size with DevX. Try PCI ID + * to determine old NICs. + */ + switch (spawn->pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: + config->txq_inline_min = MLX5_INLINE_HSIZE_L2; + config->hw_vlan_insert = 0; + break; + case PCI_DEVICE_ID_MELLANOX_CONNECTX5: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: + /* + * These NICs support VLAN insertion from WQE and + * report the wqe_vlan_insert flag. But there is a bug + * and PFC control may be broken, so disable the feature. + */ + config->hw_vlan_insert = 0; + config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; + break; + default: + config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; + break; + } +exit: + DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min); +} + +/** + * Configures the metadata mask fields in the shared context. + * + * @param [in] dev + * Pointer to Ethernet device. 
+ */ +static void +mlx5_set_metadata_mask(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + uint32_t meta, mark, reg_c0; + + reg_c0 = ~priv->vport_meta_mask; + switch (priv->config.dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + meta = UINT32_MAX; + mark = MLX5_FLOW_MARK_MASK; + break; + case MLX5_XMETA_MODE_META16: + meta = reg_c0 >> rte_bsf32(reg_c0); + mark = MLX5_FLOW_MARK_MASK; + break; + case MLX5_XMETA_MODE_META32: + meta = UINT32_MAX; + mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK; + break; + default: + meta = 0; + mark = 0; + MLX5_ASSERT(false); + break; + } + if (sh->dv_mark_mask && sh->dv_mark_mask != mark) + DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X", + sh->dv_mark_mask, mark); + else + sh->dv_mark_mask = mark; + if (sh->dv_meta_mask && sh->dv_meta_mask != meta) + DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X", + sh->dv_meta_mask, meta); + else + sh->dv_meta_mask = meta; + if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0) + DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X", + sh->dv_regc0_mask, reg_c0); + else + sh->dv_regc0_mask = reg_c0; + DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en); + DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask); + DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask); + DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask); +} + +/** + * Allocate page of door-bells and register it using DevX API. + * + * @param [in] dev + * Pointer to Ethernet device. + * + * @return + * Pointer to new page on success, NULL otherwise. + */ +static struct mlx5_devx_dbr_page * +mlx5_alloc_dbr_page(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_dbr_page *page; + + /* Allocate space for door-bell page and management data. */ + page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page), + RTE_CACHE_LINE_SIZE, dev->device->numa_node); + if (!page) { + DRV_LOG(ERR, "port %u cannot allocate dbr page", + dev->data->port_id); + return NULL; + } + /* Register allocated memory. */ + page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs, + MLX5_DBR_PAGE_SIZE, 0); + if (!page->umem) { + DRV_LOG(ERR, "port %u cannot umem reg dbr page", + dev->data->port_id); + rte_free(page); + return NULL; + } + return page; +} + +/** + * Find the next available door-bell, allocate new page if needed. + * + * @param [in] dev + * Pointer to Ethernet device. + * @param [out] dbr_page + * Door-bell page containing the page data. + * + * @return + * Door-bell address offset on success, a negative error value otherwise. + */ +int64_t +mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_dbr_page *page = NULL; + uint32_t i, j; + + LIST_FOREACH(page, &priv->dbrpgs, next) + if (page->dbr_count < MLX5_DBR_PER_PAGE) + break; + if (!page) { /* No page with free door-bell exists. */ + page = mlx5_alloc_dbr_page(dev); + if (!page) /* Failed to allocate new page. */ + return (-1); + LIST_INSERT_HEAD(&priv->dbrpgs, page, next); + } + /* Loop to find bitmap part with clear bit. */ + for (i = 0; + i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX; + i++) + ; /* Empty. */ + /* Find the first clear bit. 
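Each 64-bit word of dbr_bitmap tracks 64 door-bell records; the offset returned below is ((i * 64) + j) * sizeof(uint64_t) bytes into the page. 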
*/ + MLX5_ASSERT(i < MLX5_DBR_BITMAP_SIZE); + j = rte_bsf64(~page->dbr_bitmap[i]); + page->dbr_bitmap[i] |= (UINT64_C(1) << j); + page->dbr_count++; + *dbr_page = page; + return (((i * 64) + j) * sizeof(uint64_t)); +} + +/** + * Release a door-bell record. + * + * @param [in] dev + * Pointer to Ethernet device. + * @param [in] umem_id + * UMEM ID of page containing the door-bell record to release. + * @param [in] offset + * Offset of door-bell record in page. + * + * @return + * 0 on success, a negative error value otherwise. + */ +int32_t +mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_dbr_page *page = NULL; + int ret = 0; + + LIST_FOREACH(page, &priv->dbrpgs, next) + /* Find the page this address belongs to. */ + if (page->umem->umem_id == umem_id) + break; + if (!page) + return -EINVAL; + page->dbr_count--; + if (!page->dbr_count) { + /* Page not used, free it and remove from list. */ + LIST_REMOVE(page, next); + if (page->umem) + ret = -mlx5_glue->devx_umem_dereg(page->umem); + rte_free(page); + } else { + /* Mark in bitmap that this door-bell is not in use. */ + offset /= MLX5_DBR_SIZE; + int i = offset / 64; + int j = offset % 64; + + page->dbr_bitmap[i] &= ~(UINT64_C(1) << j); + } + return ret; +} + +int +rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n) +{ + static const char *const dynf_names[] = { + RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, + RTE_MBUF_DYNFLAG_METADATA_NAME + }; + unsigned int i; + + if (n < RTE_DIM(dynf_names)) + return -ENOMEM; + for (i = 0; i < RTE_DIM(dynf_names); i++) { + if (names[i] == NULL) + return -EINVAL; + strcpy(names[i], dynf_names[i]); + } + return RTE_DIM(dynf_names); +} + +/** + * Check sibling device configurations. + * + * Sibling devices sharing the Infiniband device context + * should have compatible configurations. This regards + * representors and bonding slaves. + * + * @param priv + * Private device descriptor. + * @param config + * Configuration of the device is going to be created. + * + * @return + * 0 on success, EINVAL otherwise + */ +static int +mlx5_dev_check_sibling_config(struct mlx5_priv *priv, + struct mlx5_dev_config *config) +{ + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_dev_config *sh_conf = NULL; + uint16_t port_id; + + MLX5_ASSERT(sh); + /* Nothing to compare for the single/first device. */ + if (sh->refcnt == 1) + return 0; + /* Find the device with shared context. */ + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (opriv && opriv != priv && opriv->sh == sh) { + sh_conf = &opriv->config; + break; + } + } + if (!sh_conf) + return 0; + if (sh_conf->dv_flow_en ^ config->dv_flow_en) { + DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch" + " for shared %s context", sh->ibdev_name); + rte_errno = EINVAL; + return rte_errno; + } + if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) { + DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch" + " for shared %s context", sh->ibdev_name); + rte_errno = EINVAL; + return rte_errno; + } + return 0; +} +/** + * Spawn an Ethernet device from Verbs information. + * + * @param dpdk_dev + * Backing DPDK device. + * @param spawn + * Verbs device parameters (name, port, switch_info) to spawn. + * @param config + * Device configuration parameters. + * + * @return + * A valid Ethernet device object on success, NULL otherwise and rte_errno + * is set. 
The following errors are defined: + * + * EBUSY: device is not supposed to be spawned. + * EEXIST: device is already spawned + */ +static struct rte_eth_dev * +mlx5_dev_spawn(struct rte_device *dpdk_dev, + struct mlx5_dev_spawn_data *spawn, + struct mlx5_dev_config config) +{ + const struct mlx5_switch_info *switch_info = &spawn->info; + struct mlx5_ibv_shared *sh = NULL; + struct ibv_port_attr port_attr; + struct mlx5dv_context dv_attr = { .comp_mask = 0 }; + struct rte_eth_dev *eth_dev = NULL; + struct mlx5_priv *priv = NULL; + int err = 0; + unsigned int hw_padding = 0; + unsigned int mps; + unsigned int cqe_comp; + unsigned int cqe_pad = 0; + unsigned int tunnel_en = 0; + unsigned int mpls_en = 0; + unsigned int swp = 0; + unsigned int mprq = 0; + unsigned int mprq_min_stride_size_n = 0; + unsigned int mprq_max_stride_size_n = 0; + unsigned int mprq_min_stride_num_n = 0; + unsigned int mprq_max_stride_num_n = 0; + struct rte_ether_addr mac; + char name[RTE_ETH_NAME_MAX_LEN]; + int own_domain_id = 0; + uint16_t port_id; + unsigned int i; +#ifdef HAVE_MLX5DV_DR_DEVX_PORT + struct mlx5dv_devx_port devx_port = { .comp_mask = 0 }; +#endif + + /* Determine if this port representor is supposed to be spawned. */ + if (switch_info->representor && dpdk_dev->devargs) { + struct rte_eth_devargs eth_da; + + err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); + if (err) { + rte_errno = -err; + DRV_LOG(ERR, "failed to process device arguments: %s", + strerror(rte_errno)); + return NULL; + } + for (i = 0; i < eth_da.nb_representor_ports; ++i) + if (eth_da.representor_ports[i] == + (uint16_t)switch_info->port_name) + break; + if (i == eth_da.nb_representor_ports) { + rte_errno = EBUSY; + return NULL; + } + } + /* Build device name. */ + if (spawn->pf_bond < 0) { + /* Single device. */ + if (!switch_info->representor) + strlcpy(name, dpdk_dev->name, sizeof(name)); + else + snprintf(name, sizeof(name), "%s_representor_%u", + dpdk_dev->name, switch_info->port_name); + } else { + /* Bonding device. */ + if (!switch_info->representor) + snprintf(name, sizeof(name), "%s_%s", + dpdk_dev->name, spawn->ibv_dev->name); + else + snprintf(name, sizeof(name), "%s_%s_representor_%u", + dpdk_dev->name, spawn->ibv_dev->name, + switch_info->port_name); + } + /* check if the device is already spawned */ + if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { + rte_errno = EEXIST; + return NULL; + } + DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + struct mlx5_mp_id mp_id; + + eth_dev = rte_eth_dev_attach_secondary(name); + if (eth_dev == NULL) { + DRV_LOG(ERR, "can not attach rte ethdev"); + rte_errno = ENOMEM; + return NULL; + } + eth_dev->device = dpdk_dev; + eth_dev->dev_ops = &mlx5_dev_sec_ops; + err = mlx5_proc_priv_init(eth_dev); + if (err) + return NULL; + mp_id.port_id = eth_dev->data->port_id; + strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); + /* Receive command fd from primary process */ + err = mlx5_mp_req_verbs_cmd_fd(&mp_id); + if (err < 0) + return NULL; + /* Remap UAR for Tx queues. */ + err = mlx5_tx_uar_init_secondary(eth_dev, err); + if (err) + return NULL; + /* + * Ethdev pointer is still required as input since + * the primary device is not accessible from the + * secondary process. 
+ */ + eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); + eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); + return eth_dev; + } + /* + * Some parameters ("tx_db_nc" in particularly) are needed in + * advance to create dv/verbs device context. We proceed the + * devargs here to get ones, and later proceed devargs again + * to override some hardware settings. + */ + err = mlx5_args(&config, dpdk_dev->devargs); + if (err) { + err = rte_errno; + DRV_LOG(ERR, "failed to process device arguments: %s", + strerror(rte_errno)); + goto error; + } + sh = mlx5_alloc_shared_ibctx(spawn, &config); + if (!sh) + return NULL; + config.devx = sh->devx; +#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR + config.dest_tir = 1; +#endif +#ifdef HAVE_IBV_MLX5_MOD_SWP + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; +#endif + /* + * Multi-packet send is supported by ConnectX-4 Lx PF as well + * as all ConnectX-5 devices. + */ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; +#endif +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; +#endif + mlx5_glue->dv_query_device(sh->ctx, &dv_attr); + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { + DRV_LOG(DEBUG, "enhanced MPW is supported"); + mps = MLX5_MPW_ENHANCED; + } else { + DRV_LOG(DEBUG, "MPW is supported"); + mps = MLX5_MPW; + } + } else { + DRV_LOG(DEBUG, "MPW isn't supported"); + mps = MLX5_MPW_DISABLED; + } +#ifdef HAVE_IBV_MLX5_MOD_SWP + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) + swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; + DRV_LOG(DEBUG, "SWP support: %u", swp); +#endif + config.swp = !!swp; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { + struct mlx5dv_striding_rq_caps mprq_caps = + dv_attr.striding_rq_caps; + + DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", + mprq_caps.min_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", + mprq_caps.max_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", + mprq_caps.min_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", + mprq_caps.max_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tsupported_qpts: %d", + mprq_caps.supported_qpts); + DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); + mprq = 1; + mprq_min_stride_size_n = + mprq_caps.min_single_stride_log_num_of_bytes; + mprq_max_stride_size_n = + mprq_caps.max_single_stride_log_num_of_bytes; + mprq_min_stride_num_n = + mprq_caps.min_single_wqe_log_num_of_strides; + mprq_max_stride_num_n = + mprq_caps.max_single_wqe_log_num_of_strides; + } +#endif + if (RTE_CACHE_LINE_SIZE == 128 && + !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) + cqe_comp = 0; + else + cqe_comp = 1; + config.cqe_comp = cqe_comp; +#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD + /* Whether device supports 128B Rx CQE padding. 
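Padding is only used when the cache line size is 128B and the device reports the CQE_128B_PAD flag. 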
*/ + cqe_pad = RTE_CACHE_LINE_SIZE == 128 && + (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); +#endif +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { + tunnel_en = ((dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE)); + } + DRV_LOG(DEBUG, "tunnel offloading is %ssupported", + tunnel_en ? "" : "not "); +#else + DRV_LOG(WARNING, + "tunnel offloading disabled due to old OFED/rdma-core version"); +#endif + config.tunnel_en = tunnel_en; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + mpls_en = ((dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); + DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", + mpls_en ? "" : "not "); +#else + DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" + " old OFED/rdma-core version or firmware configuration"); +#endif + config.mpls_en = mpls_en; + /* Check port status. */ + err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr); + if (err) { + DRV_LOG(ERR, "port query failed: %s", strerror(err)); + goto error; + } + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + DRV_LOG(ERR, "port is not configured in Ethernet mode"); + err = EINVAL; + goto error; + } + if (port_attr.state != IBV_PORT_ACTIVE) + DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", + mlx5_glue->port_state_str(port_attr.state), + port_attr.state); + /* Allocate private eth device data. */ + priv = rte_zmalloc("ethdev private structure", + sizeof(*priv), + RTE_CACHE_LINE_SIZE); + if (priv == NULL) { + DRV_LOG(ERR, "priv allocation failure"); + err = ENOMEM; + goto error; + } + priv->sh = sh; + priv->ibv_port = spawn->ibv_port; + priv->pci_dev = spawn->pci_dev; + priv->mtu = RTE_ETHER_MTU; + priv->mp_id.port_id = port_id; + strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); +#ifndef RTE_ARCH_64 + /* Initialize UAR access locks for 32bit implementations. */ + rte_spinlock_init(&priv->uar_lock_cq); + for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) + rte_spinlock_init(&priv->uar_lock[i]); +#endif + /* Some internal functions rely on Netlink sockets, open them now. */ + priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); + priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); + priv->representor = !!switch_info->representor; + priv->master = !!switch_info->master; + priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + priv->vport_meta_tag = 0; + priv->vport_meta_mask = 0; + priv->pf_bond = spawn->pf_bond; +#ifdef HAVE_MLX5DV_DR_DEVX_PORT + /* + * The DevX port query API is implemented. E-Switch may use + * either vport or reg_c[0] metadata register to match on + * vport index. The engaged part of metadata register is + * defined by mask. 
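+ * A zero reg_c_0 mask reported for a representor or master port is rejected below as unsupported. 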
+ */ + if (switch_info->representor || switch_info->master) { + devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | + MLX5DV_DEVX_PORT_MATCH_REG_C_0; + err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port, + &devx_port); + if (err) { + DRV_LOG(WARNING, + "can't query devx port %d on device %s", + spawn->ibv_port, spawn->ibv_dev->name); + devx_port.comp_mask = 0; + } + } + if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { + priv->vport_meta_tag = devx_port.reg_c_0.value; + priv->vport_meta_mask = devx_port.reg_c_0.mask; + if (!priv->vport_meta_mask) { + DRV_LOG(ERR, "vport zero mask for port %d" + " on bonding device %s", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } + if (priv->vport_meta_tag & ~priv->vport_meta_mask) { + DRV_LOG(ERR, "invalid vport tag for port %d" + " on bonding device %s", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } + } + if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { + priv->vport_id = devx_port.vport_num; + } else if (spawn->pf_bond >= 0) { + DRV_LOG(ERR, "can't deduce vport index for port %d" + " on bonding device %s", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } else { + /* Suppose vport index in compatible way. */ + priv->vport_id = switch_info->representor ? + switch_info->port_name + 1 : -1; + } +#else + /* + * Kernel/rdma_core support single E-Switch per PF configurations + * only and vport_id field contains the vport index for + * associated VF, which is deduced from representor port name. + * For example, let's have the IB device port 10, it has + * attached network device eth0, which has port name attribute + * pf0vf2, we can deduce the VF number as 2, and set vport index + * as 3 (2+1). This assigning schema should be changed if the + * multiple E-Switch instances per PF configurations or/and PCI + * subfunctions are added. + */ + priv->vport_id = switch_info->representor ? + switch_info->port_name + 1 : -1; +#endif + /* representor_id field keeps the unmodified VF index. */ + priv->representor_id = switch_info->representor ? + switch_info->port_name : -1; + /* + * Look for sibling devices in order to reuse their switch domain + * if any, otherwise allocate one. + */ + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + const struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (!opriv || + opriv->sh != priv->sh || + opriv->domain_id == + RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) + continue; + priv->domain_id = opriv->domain_id; + break; + } + if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { + err = rte_eth_switch_domain_alloc(&priv->domain_id); + if (err) { + err = rte_errno; + DRV_LOG(ERR, "unable to allocate switch domain: %s", + strerror(rte_errno)); + goto error; + } + own_domain_id = 1; + } + /* Override some values set by hardware configuration. */ + mlx5_args(&config, dpdk_dev->devargs); + err = mlx5_dev_check_sibling_config(priv, &config); + if (err) + goto error; + config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & + IBV_DEVICE_RAW_IP_CSUM); + DRV_LOG(DEBUG, "checksum offloading is %ssupported", + (config.hw_csum ? 
"" : "not ")); +#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ + !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + DRV_LOG(DEBUG, "counters are not supported"); +#endif +#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR) + if (config.dv_flow_en) { + DRV_LOG(WARNING, "DV flow is not supported"); + config.dv_flow_en = 0; + } +#endif + config.ind_table_max_size = + sh->device_attr.rss_caps.max_rwq_indirection_table_size; + /* + * Remove this check once DPDK supports larger/variable + * indirection tables. + */ + if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) + config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; + DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", + config.ind_table_max_size); + config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & + IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); + DRV_LOG(DEBUG, "VLAN stripping is %ssupported", + (config.hw_vlan_strip ? "" : "not ")); + config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & + IBV_RAW_PACKET_CAP_SCATTER_FCS); + DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", + (config.hw_fcs_strip ? "" : "not ")); +#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) + hw_padding = !!sh->device_attr.rx_pad_end_addr_align; +#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) + hw_padding = !!(sh->device_attr.device_cap_flags_ex & + IBV_DEVICE_PCI_WRITE_END_PADDING); +#endif + if (config.hw_padding && !hw_padding) { + DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); + config.hw_padding = 0; + } else if (config.hw_padding) { + DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); + } + config.tso = (sh->device_attr.tso_caps.max_tso > 0 && + (sh->device_attr.tso_caps.supported_qpts & + (1 << IBV_QPT_RAW_PACKET))); + if (config.tso) + config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso; + /* + * MPW is disabled by default, while the Enhanced MPW is enabled + * by default. + */ + if (config.mps == MLX5_ARG_UNSET) + config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : + MLX5_MPW_DISABLED; + else + config.mps = config.mps ? mps : MLX5_MPW_DISABLED; + DRV_LOG(INFO, "%sMPS is %s", + config.mps == MLX5_MPW_ENHANCED ? "enhanced " : + config.mps == MLX5_MPW ? "legacy " : "", + config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); + if (config.cqe_comp && !cqe_comp) { + DRV_LOG(WARNING, "Rx CQE compression isn't supported"); + config.cqe_comp = 0; + } + if (config.cqe_pad && !cqe_pad) { + DRV_LOG(WARNING, "Rx CQE padding isn't supported"); + config.cqe_pad = 0; + } else if (config.cqe_pad) { + DRV_LOG(INFO, "Rx CQE padding is enabled"); + } + if (config.devx) { + priv->counter_fallback = 0; + err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); + if (err) { + err = -err; + goto error; + } + if (!config.hca_attr.flow_counters_dump) + priv->counter_fallback = 1; +#ifndef HAVE_IBV_DEVX_ASYNC + priv->counter_fallback = 1; +#endif + if (priv->counter_fallback) + DRV_LOG(INFO, "Use fall-back DV counter management"); + /* Check for LRO support. */ + if (config.dest_tir && config.hca_attr.lro_cap && + config.dv_flow_en) { + /* TBD check tunnel lro caps. */ + config.lro.supported = config.hca_attr.lro_cap; + DRV_LOG(DEBUG, "Device supports LRO"); + /* + * If LRO timeout is not configured by application, + * use the minimal supported value. 
+ */ + if (!config.lro.timeout) + config.lro.timeout = + config.hca_attr.lro_timer_supported_periods[0]; + DRV_LOG(DEBUG, "LRO session timeout set to %d usec", + config.lro.timeout); + } +#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) + if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup && + config.dv_flow_en) { + uint8_t reg_c_mask = + config.hca_attr.qos.flow_meter_reg_c_ids; + /* + * Meter needs two REG_C's for color match and pre-sfx + * flow match. Here get the REG_C for color match. + * REG_C_0 and REG_C_1 is reserved for metadata feature. + */ + reg_c_mask &= 0xfc; + if (__builtin_popcount(reg_c_mask) < 1) { + priv->mtr_en = 0; + DRV_LOG(WARNING, "No available register for" + " meter."); + } else { + priv->mtr_color_reg = ffs(reg_c_mask) - 1 + + REG_C_0; + priv->mtr_en = 1; + priv->mtr_reg_share = + config.hca_attr.qos.flow_meter_reg_share; + DRV_LOG(DEBUG, "The REG_C meter uses is %d", + priv->mtr_color_reg); + } + } +#endif + } + if (config.mprq.enabled && mprq) { + if (config.mprq.stride_num_n && + (config.mprq.stride_num_n > mprq_max_stride_num_n || + config.mprq.stride_num_n < mprq_min_stride_num_n)) { + config.mprq.stride_num_n = + RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, + mprq_min_stride_num_n), + mprq_max_stride_num_n); + DRV_LOG(WARNING, + "the number of strides" + " for Multi-Packet RQ is out of range," + " setting default value (%u)", + 1 << config.mprq.stride_num_n); + } + if (config.mprq.stride_size_n && + (config.mprq.stride_size_n > mprq_max_stride_size_n || + config.mprq.stride_size_n < mprq_min_stride_size_n)) { + config.mprq.stride_size_n = + RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, + mprq_min_stride_size_n), + mprq_max_stride_size_n); + DRV_LOG(WARNING, + "the size of a stride" + " for Multi-Packet RQ is out of range," + " setting default value (%u)", + 1 << config.mprq.stride_size_n); + } + config.mprq.min_stride_size_n = mprq_min_stride_size_n; + config.mprq.max_stride_size_n = mprq_max_stride_size_n; + } else if (config.mprq.enabled && !mprq) { + DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); + config.mprq.enabled = 0; + } + if (config.max_dump_files_num == 0) + config.max_dump_files_num = 128; + eth_dev = rte_eth_dev_allocate(name); + if (eth_dev == NULL) { + DRV_LOG(ERR, "can not allocate rte ethdev"); + err = ENOMEM; + goto error; + } + /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ + eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; + if (priv->representor) { + eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; + eth_dev->data->representor_id = priv->representor_id; + } + /* + * Store associated network device interface index. This index + * is permanent throughout the lifetime of device. So, we may store + * the ifindex here and use the cached value further. + */ + MLX5_ASSERT(spawn->ifindex); + priv->if_index = spawn->ifindex; + eth_dev->data->dev_private = priv; + priv->dev_data = eth_dev->data; + eth_dev->data->mac_addrs = priv->mac; + eth_dev->device = dpdk_dev; + /* Configure the first MAC address by default. */ + if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { + DRV_LOG(ERR, + "port %u cannot get MAC address, is mlx5_en" + " loaded? 
(errno: %s)", + eth_dev->data->port_id, strerror(rte_errno)); + err = ENODEV; + goto error; + } + DRV_LOG(INFO, + "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", + eth_dev->data->port_id, + mac.addr_bytes[0], mac.addr_bytes[1], + mac.addr_bytes[2], mac.addr_bytes[3], + mac.addr_bytes[4], mac.addr_bytes[5]); +#ifdef RTE_LIBRTE_MLX5_DEBUG + { + char ifname[IF_NAMESIZE]; + + if (mlx5_get_ifname(eth_dev, &ifname) == 0) + DRV_LOG(DEBUG, "port %u ifname is \"%s\"", + eth_dev->data->port_id, ifname); + else + DRV_LOG(DEBUG, "port %u ifname is unknown", + eth_dev->data->port_id); + } +#endif + /* Get actual MTU if possible. */ + err = mlx5_get_mtu(eth_dev, &priv->mtu); + if (err) { + err = rte_errno; + goto error; + } + DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, + priv->mtu); + /* Initialize burst functions to prevent crashes before link-up. */ + eth_dev->rx_pkt_burst = removed_rx_burst; + eth_dev->tx_pkt_burst = removed_tx_burst; + eth_dev->dev_ops = &mlx5_dev_ops; + /* Register MAC address. */ + claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); + if (config.vf && config.vf_nl_en) + mlx5_nl_mac_addr_sync(priv->nl_socket_route, + mlx5_ifindex(eth_dev), + eth_dev->data->mac_addrs, + MLX5_MAX_MAC_ADDRESSES); + priv->flows = 0; + priv->ctrl_flows = 0; + TAILQ_INIT(&priv->flow_meters); + TAILQ_INIT(&priv->flow_meter_profiles); + /* Hint libmlx5 to use PMD allocator for data plane resources */ + struct mlx5dv_ctx_allocators alctr = { + .alloc = &mlx5_alloc_verbs_buf, + .free = &mlx5_free_verbs_buf, + .data = priv, + }; + mlx5_glue->dv_set_context_attr(sh->ctx, + MLX5DV_CTX_ATTR_BUF_ALLOCATORS, + (void *)((uintptr_t)&alctr)); + /* Bring Ethernet device up. */ + DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", + eth_dev->data->port_id); + mlx5_set_link_up(eth_dev); + /* + * Even though the interrupt handler is not installed yet, + * interrupts will still trigger on the async_fd from + * Verbs context returned by ibv_open_device(). + */ + mlx5_link_update(eth_dev, 0); +#ifdef HAVE_MLX5DV_DR_ESWITCH + if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && + (switch_info->representor || switch_info->master))) + config.dv_esw_en = 0; +#else + config.dv_esw_en = 0; +#endif + /* Detect minimal data bytes to inline. */ + mlx5_set_min_inline(spawn, &config); + /* Store device configuration on private structure. */ + priv->config = config; + /* Create context for virtual machine VLAN workaround. */ + priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); + if (config.dv_flow_en) { + err = mlx5_alloc_shared_dr(priv); + if (err) + goto error; + /* + * RSS id is shared with meter flow id. Meter flow id can only + * use the 24 MSB of the register. + */ + priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >> + MLX5_MTR_COLOR_BITS); + if (!priv->qrss_id_pool) { + DRV_LOG(ERR, "can't create flow id pool"); + err = ENOMEM; + goto error; + } + } + /* Supported Verbs flow priority number detection. 
*/ + err = mlx5_flow_discover_priorities(eth_dev); + if (err < 0) { + err = -err; + goto error; + } + priv->config.flow_prio = err; + if (!priv->config.dv_esw_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + DRV_LOG(WARNING, "metadata mode %u is not supported " + "(no E-Switch)", priv->config.dv_xmeta_en); + priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY; + } + mlx5_set_metadata_mask(eth_dev); + if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + !priv->sh->dv_regc0_mask) { + DRV_LOG(ERR, "metadata mode %u is not supported " + "(no metadata reg_c[0] is available)", + priv->config.dv_xmeta_en); + err = ENOTSUP; + goto error; + } + /* + * Allocate the buffer for flow creation, just once. + * The allocation must be done before any flow is created. + */ + mlx5_flow_alloc_intermediate(eth_dev); + /* Query availability of metadata reg_c's. */ + err = mlx5_flow_discover_mreg_c(eth_dev); + if (err < 0) { + err = -err; + goto error; + } + if (!mlx5_flow_ext_mreg_supported(eth_dev)) { + DRV_LOG(DEBUG, + "port %u extensive metadata register is not supported", + eth_dev->data->port_id); + if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + DRV_LOG(ERR, "metadata mode %u is not supported " + "(no metadata registers available)", + priv->config.dv_xmeta_en); + err = ENOTSUP; + goto error; + } + } + if (priv->config.dv_flow_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(eth_dev) && + priv->sh->dv_regc0_mask) { + priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME, + MLX5_FLOW_MREG_HTABLE_SZ); + if (!priv->mreg_cp_tbl) { + err = ENOMEM; + goto error; + } + } + return eth_dev; +error: + if (priv) { + if (priv->mreg_cp_tbl) + mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL); + if (priv->sh) + mlx5_free_shared_dr(priv); + if (priv->nl_socket_route >= 0) + close(priv->nl_socket_route); + if (priv->nl_socket_rdma >= 0) + close(priv->nl_socket_rdma); + if (priv->vmwa_context) + mlx5_vlan_vmwa_exit(priv->vmwa_context); + if (priv->qrss_id_pool) + mlx5_flow_id_pool_release(priv->qrss_id_pool); + if (own_domain_id) + claim_zero(rte_eth_switch_domain_free(priv->domain_id)); + rte_free(priv); + if (eth_dev != NULL) + eth_dev->data->dev_private = NULL; + } + if (eth_dev != NULL) { + /* mac_addrs must not be freed alone because it is part of dev_private */ + eth_dev->data->mac_addrs = NULL; + rte_eth_dev_release_port(eth_dev); + } + if (sh) + mlx5_free_shared_ibctx(sh); + MLX5_ASSERT(err > 0); + rte_errno = err; + return NULL; +} + +/** + * Comparison callback to sort device data. + * + * This is meant to be used with qsort(). + * + * @param a[in] + * Pointer to pointer to first data object. + * @param b[in] + * Pointer to pointer to second data object. + * + * @return + * 0 if both objects are equal, less than 0 if the first argument is less + * than the second, greater than 0 otherwise. + */ +static int +mlx5_dev_spawn_data_cmp(const void *a, const void *b) +{ + const struct mlx5_switch_info *si_a = + &((const struct mlx5_dev_spawn_data *)a)->info; + const struct mlx5_switch_info *si_b = + &((const struct mlx5_dev_spawn_data *)b)->info; + int ret; + + /* Master device first. */ + ret = si_b->master - si_a->master; + if (ret) + return ret; + /* Then representor devices. */ + ret = si_b->representor - si_a->representor; + if (ret) + return ret; + /* Unidentified devices come last in no specific order. */ + if (!si_a->representor) + return 0; + /* Order representors by name. 
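The numeric port_name values are compared, so lower representor indices sort first. 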
*/ + return si_a->port_name - si_b->port_name; +} + +/** + * Match PCI information for possible slaves of bonding device. + * + * @param[in] ibv_dev + * Pointer to Infiniband device structure. + * @param[in] pci_dev + * Pointer to PCI device structure to match PCI address. + * @param[in] nl_rdma + * Netlink RDMA group socket handle. + * + * @return + * negative value if no bonding device found, otherwise + * positive index of slave PF in bonding. + */ +static int +mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, + const struct rte_pci_device *pci_dev, + int nl_rdma) +{ + char ifname[IF_NAMESIZE + 1]; + unsigned int ifindex; + unsigned int np, i; + FILE *file = NULL; + int pf = -1; + + /* + * Try to get the master device name. If something goes + * wrong, assume there is no kernel support and no + * bonding devices. + */ + if (nl_rdma < 0) + return -1; + if (!strstr(ibv_dev->name, "bond")) + return -1; + np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); + if (!np) + return -1; + /* + * The master device might not be on the predefined + * port (port index 1 is not guaranteed), so we have + * to scan all Infiniband device ports and find the + * master. + */ + for (i = 1; i <= np; ++i) { + /* Check whether Infiniband port is populated. */ + ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); + if (!ifindex) + continue; + if (!if_indextoname(ifindex, ifname)) + continue; + /* Try to read bonding slave names from sysfs. */ + MKSTR(slaves, + "/sys/class/net/%s/master/bonding/slaves", ifname); + file = fopen(slaves, "r"); + if (file) + break; + } + if (!file) + return -1; + /* Use safe format to check maximal buffer length. */ + MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); + while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { + char tmp_str[IF_NAMESIZE + 32]; + struct rte_pci_addr pci_addr; + struct mlx5_switch_info info; + + /* Process slave interface names in the loop. */ + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s", ifname); + if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { + DRV_LOG(WARNING, "can not get PCI address" + " for netdev \"%s\"", ifname); + continue; + } + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + /* Slave interface PCI address match found. */ + fclose(file); + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s/phys_port_name", ifname); + file = fopen(tmp_str, "rb"); + if (!file) + break; + info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; + if (fscanf(file, "%32s", tmp_str) == 1) + mlx5_translate_port_name(tmp_str, &info); + if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || + info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) + pf = info.port_name; + break; + } + if (file) + fclose(file); + return pf; +} + +/** + * DPDK callback to register a PCI device. + * + * This function spawns Ethernet devices out of a given PCI device. + * + * @param[in] pci_drv + * PCI driver structure (mlx5_driver). + * @param[in] pci_dev + * PCI device information. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) +{ + struct ibv_device **ibv_list; + /* + * Number of found IB Devices matching with requested PCI BDF. + * nd != 1 means there are multiple IB devices over the same + * PCI device and we have representors and master. 
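+ * nd == 0 means no matching device was found and probing fails with ENOENT below. 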
+ */ + unsigned int nd = 0; + /* + * Number of found IB device Ports. nd = 1 and np = 1..n means + * we have the single multiport IB device, and there may be + * representors attached to some of found ports. + */ + unsigned int np = 0; + /* + * Number of DPDK ethernet devices to Spawn - either over + * multiple IB devices or multiple ports of single IB device. + * Actually this is the number of iterations to spawn. + */ + unsigned int ns = 0; + /* + * Bonding device + * < 0 - no bonding device (single one) + * >= 0 - bonding device (value is slave PF index) + */ + int bd = -1; + struct mlx5_dev_spawn_data *list = NULL; + struct mlx5_dev_config dev_config; + int ret; + + if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) { + DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5" + " driver."); + return 1; + } + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + mlx5_pmd_socket_init(); + ret = mlx5_init_once(); + if (ret) { + DRV_LOG(ERR, "unable to init PMD global data: %s", + strerror(rte_errno)); + return -rte_errno; + } + MLX5_ASSERT(pci_drv == &mlx5_driver); + errno = 0; + ibv_list = mlx5_glue->get_device_list(&ret); + if (!ibv_list) { + rte_errno = errno ? errno : ENOSYS; + DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); + return -rte_errno; + } + /* + * First scan the list of all Infiniband devices to find + * matching ones, gathering into the list. + */ + struct ibv_device *ibv_match[ret + 1]; + int nl_route = mlx5_nl_init(NETLINK_ROUTE); + int nl_rdma = mlx5_nl_init(NETLINK_RDMA); + unsigned int i; + + while (ret-- > 0) { + struct rte_pci_addr pci_addr; + + DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); + bd = mlx5_device_bond_pci_match + (ibv_list[ret], pci_dev, nl_rdma); + if (bd >= 0) { + /* + * Bonding device detected. Only one match is allowed, + * the bonding is supported over multi-port IB device, + * there should be no matches on representor PCI + * functions or non VF LAG bonding devices with + * specified address. + */ + if (nd) { + DRV_LOG(ERR, + "multiple PCI match on bonding device" + "\"%s\" found", ibv_list[ret]->name); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + DRV_LOG(INFO, "PCI information matches for" + " slave %d bonding device \"%s\"", + bd, ibv_list[ret]->name); + ibv_match[nd++] = ibv_list[ret]; + break; + } + if (mlx5_dev_to_pci_addr + (ibv_list[ret]->ibdev_path, &pci_addr)) + continue; + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + DRV_LOG(INFO, "PCI information matches for device \"%s\"", + ibv_list[ret]->name); + ibv_match[nd++] = ibv_list[ret]; + } + ibv_match[nd] = NULL; + if (!nd) { + /* No device matches, just complain and bail out. */ + DRV_LOG(WARNING, + "no Verbs device matches PCI device " PCI_PRI_FMT "," + " are kernel drivers loaded?", + pci_dev->addr.domain, pci_dev->addr.bus, + pci_dev->addr.devid, pci_dev->addr.function); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + if (nd == 1) { + /* + * Found single matching device may have multiple ports. + * Each port may be representor, we have to check the port + * number and check the representors existence. 
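+ * The port count is queried over the Netlink RDMA socket below. 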
+ */ + if (nl_rdma >= 0) + np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); + if (!np) + DRV_LOG(WARNING, "can not get IB device \"%s\"" + " ports number", ibv_match[0]->name); + if (bd >= 0 && !np) { + DRV_LOG(ERR, "can not get ports" + " for bonding device"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + } +#ifndef HAVE_MLX5DV_DR_DEVX_PORT + if (bd >= 0) { + /* + * This may happen if there is VF LAG kernel support and + * application is compiled with older rdma_core library. + */ + DRV_LOG(ERR, + "No kernel/verbs support for VF LAG bonding found."); + rte_errno = ENOTSUP; + ret = -rte_errno; + goto exit; + } +#endif + /* + * Now we can determine the maximal + * amount of devices to be spawned. + */ + list = rte_zmalloc("device spawn data", + sizeof(struct mlx5_dev_spawn_data) * + (np ? np : nd), + RTE_CACHE_LINE_SIZE); + if (!list) { + DRV_LOG(ERR, "spawn data array allocation failure"); + rte_errno = ENOMEM; + ret = -rte_errno; + goto exit; + } + if (bd >= 0 || np > 1) { + /* + * Single IB device with multiple ports found, + * it may be E-Switch master device and representors. + * We have to perform identification through the ports. + */ + MLX5_ASSERT(nl_rdma >= 0); + MLX5_ASSERT(ns == 0); + MLX5_ASSERT(nd == 1); + MLX5_ASSERT(np); + for (i = 1; i <= np; ++i) { + list[ns].max_port = np; + list[ns].ibv_port = i; + list[ns].ibv_dev = ibv_match[0]; + list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; + list[ns].pf_bond = bd; + list[ns].ifindex = mlx5_nl_ifindex + (nl_rdma, list[ns].ibv_dev->name, i); + if (!list[ns].ifindex) { + /* + * No network interface index found for the + * specified port, it means there is no + * representor on this port. It's OK, + * there can be disabled ports, for example + * if sriov_numvfs < sriov_totalvfs. + */ + continue; + } + ret = -1; + if (nl_route >= 0) + ret = mlx5_nl_switch_info + (nl_route, + list[ns].ifindex, + &list[ns].info); + if (ret || (!list[ns].info.representor && + !list[ns].info.master)) { + /* + * We failed to recognize representors with + * Netlink, let's try to perform the task + * with sysfs. + */ + ret = mlx5_sysfs_switch_info + (list[ns].ifindex, + &list[ns].info); + } + if (!ret && bd >= 0) { + switch (list[ns].info.name_type) { + case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: + if (list[ns].info.port_name == bd) + ns++; + break; + case MLX5_PHYS_PORT_NAME_TYPE_PFVF: + if (list[ns].info.pf_num == bd) + ns++; + break; + default: + break; + } + continue; + } + if (!ret && (list[ns].info.representor ^ + list[ns].info.master)) + ns++; + } + if (!ns) { + DRV_LOG(ERR, + "unable to recognize master/representors" + " on the IB device with multiple ports"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + } else { + /* + * The existence of several matching entries (nd > 1) means + * port representors have been instantiated. No existing Verbs + * call nor sysfs entries can tell them apart, this can only + * be done through Netlink calls assuming kernel drivers are + * recent enough to support them. + * + * In the event of identification failure through Netlink, + * try again through sysfs, then: + * + * 1. A single IB device matches (nd == 1) with single + * port (np=0/1) and is not a representor, assume + * no switch support. + * + * 2. Otherwise no safe assumptions can be made; + * complain louder and bail out. 
+ */ + np = 1; + for (i = 0; i != nd; ++i) { + memset(&list[ns].info, 0, sizeof(list[ns].info)); + list[ns].max_port = 1; + list[ns].ibv_port = 1; + list[ns].ibv_dev = ibv_match[i]; + list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; + list[ns].pf_bond = -1; + list[ns].ifindex = 0; + if (nl_rdma >= 0) + list[ns].ifindex = mlx5_nl_ifindex + (nl_rdma, list[ns].ibv_dev->name, 1); + if (!list[ns].ifindex) { + char ifname[IF_NAMESIZE]; + + /* + * Netlink failed, it may happen with old + * ib_core kernel driver (before 4.16). + * We can assume there is old driver because + * here we are processing single ports IB + * devices. Let's try sysfs to retrieve + * the ifindex. The method works for + * master device only. + */ + if (nd > 1) { + /* + * Multiple devices found, assume + * representors, can not distinguish + * master/representor and retrieve + * ifindex via sysfs. + */ + continue; + } + ret = mlx5_get_master_ifname + (ibv_match[i]->ibdev_path, &ifname); + if (!ret) + list[ns].ifindex = + if_nametoindex(ifname); + if (!list[ns].ifindex) { + /* + * No network interface index found + * for the specified device, it means + * there it is neither representor + * nor master. + */ + continue; + } + } + ret = -1; + if (nl_route >= 0) + ret = mlx5_nl_switch_info + (nl_route, + list[ns].ifindex, + &list[ns].info); + if (ret || (!list[ns].info.representor && + !list[ns].info.master)) { + /* + * We failed to recognize representors with + * Netlink, let's try to perform the task + * with sysfs. + */ + ret = mlx5_sysfs_switch_info + (list[ns].ifindex, + &list[ns].info); + } + if (!ret && (list[ns].info.representor ^ + list[ns].info.master)) { + ns++; + } else if ((nd == 1) && + !list[ns].info.representor && + !list[ns].info.master) { + /* + * Single IB device with + * one physical port and + * attached network device. + * May be SRIOV is not enabled + * or there is no representors. + */ + DRV_LOG(INFO, "no E-Switch support detected"); + ns++; + break; + } + } + if (!ns) { + DRV_LOG(ERR, + "unable to recognize master/representors" + " on the multiple IB devices"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + } + MLX5_ASSERT(ns); + /* + * Sort list to probe devices in natural order for users convenience + * (i.e. master first, then representors from lowest to highest ID). + */ + qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); + /* Default configuration. */ + dev_config = (struct mlx5_dev_config){ + .hw_padding = 0, + .mps = MLX5_ARG_UNSET, + .dbnc = MLX5_ARG_UNSET, + .rx_vec_en = 1, + .txq_inline_max = MLX5_ARG_UNSET, + .txq_inline_min = MLX5_ARG_UNSET, + .txq_inline_mpw = MLX5_ARG_UNSET, + .txqs_inline = MLX5_ARG_UNSET, + .vf_nl_en = 1, + .mr_ext_memseg_en = 1, + .mprq = { + .enabled = 0, /* Disabled by default. */ + .stride_num_n = 0, + .stride_size_n = 0, + .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, + .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, + }, + .dv_esw_en = 1, + .dv_flow_en = 1, + .log_hp_size = MLX5_ARG_UNSET, + }; + /* Device specific configuration. 
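Only the VF device IDs set dev_config.vf below; physical function devices keep the defaults. 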
*/ + switch (pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF: + dev_config.vf = 1; + break; + default: + break; + } + for (i = 0; i != ns; ++i) { + uint32_t restore; + + list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, + &list[i], + dev_config); + if (!list[i].eth_dev) { + if (rte_errno != EBUSY && rte_errno != EEXIST) + break; + /* Device is disabled or already spawned. Ignore it. */ + continue; + } + restore = list[i].eth_dev->data->dev_flags; + rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); + /* Restore non-PCI flags cleared by the above call. */ + list[i].eth_dev->data->dev_flags |= restore; + mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev); + rte_eth_dev_probing_finish(list[i].eth_dev); + } + if (i != ns) { + DRV_LOG(ERR, + "probe of PCI device " PCI_PRI_FMT " aborted after" + " encountering an error: %s", + pci_dev->addr.domain, pci_dev->addr.bus, + pci_dev->addr.devid, pci_dev->addr.function, + strerror(rte_errno)); + ret = -rte_errno; + /* Roll back. */ + while (i--) { + if (!list[i].eth_dev) + continue; + mlx5_dev_close(list[i].eth_dev); + /* mac_addrs must not be freed because it is part of dev_private */ + list[i].eth_dev->data->mac_addrs = NULL; + claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); + } + /* Restore original error. */ + rte_errno = -ret; + } else { + ret = 0; + } +exit: + /* + * Do the routine cleanup: + * - close opened Netlink sockets + * - free allocated spawn data array + * - free the Infiniband device list + */ + if (nl_rdma >= 0) + close(nl_rdma); + if (nl_route >= 0) + close(nl_route); + if (list) + rte_free(list); + MLX5_ASSERT(ibv_list); + mlx5_glue->free_device_list(ibv_list); + return ret; +} + +/** + * Look for the Ethernet device belonging to the mlx5 driver. + * + * @param[in] port_id + * port_id to start looking for the device. + * @param[in] pci_dev + * Pointer to the hint PCI device. When the device is being probed, + * its siblings (master and preceding representors) might not have + * a driver assigned yet (because mlx5_pci_probe() is not completed + * yet); in this case matching on the hint PCI device may be used to + * detect a sibling device. + * + * @return + * port_id of the found device, RTE_MAX_ETHPORTS if not found. + */ +uint16_t +mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev) +{ + while (port_id < RTE_MAX_ETHPORTS) { + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + + if (dev->state != RTE_ETH_DEV_UNUSED && + dev->device && + (dev->device == &pci_dev->device || + (dev->device->driver && + dev->device->driver->name && + !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME)))) + break; + port_id++; + } + if (port_id >= RTE_MAX_ETHPORTS) + return RTE_MAX_ETHPORTS; + return port_id; +} + +/** + * DPDK callback to remove a PCI device. + * + * This function removes all Ethernet devices belonging to a given PCI device. + * + * @param[in] pci_dev + * Pointer to the PCI device. + * + * @return + * 0 on success, the function cannot fail. 
+ */ +static int +mlx5_pci_remove(struct rte_pci_device *pci_dev) +{ + uint16_t port_id; + + RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) + rte_eth_dev_close(port_id); + return 0; +} + +static const struct rte_pci_id mlx5_pci_id_map[] = { + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6VF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DX) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF) + }, + { + .vendor_id = 0 + } +}; + +static struct rte_pci_driver mlx5_driver = { + .driver = { + .name = MLX5_DRIVER_NAME + }, + .id_table = mlx5_pci_id_map, + .probe = mlx5_pci_probe, + .remove = mlx5_pci_remove, + .dma_map = mlx5_dma_map, + .dma_unmap = mlx5_dma_unmap, + .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV | + RTE_PCI_DRV_PROBE_AGAIN, +}; + +/** + * Driver initialization routine. + */ +RTE_INIT(rte_mlx5_pmd_init) +{ + /* Initialize driver log type. */ + mlx5_logtype = rte_log_register("pmd.net.mlx5"); + if (mlx5_logtype >= 0) + rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE); + + /* Build the static tables for Verbs conversion. */ + mlx5_set_ptype_table(); + mlx5_set_cksum_table(); + mlx5_set_swp_types_table(); + if (mlx5_glue) + rte_pci_register(&mlx5_driver); +} + +RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); +RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); +RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5.h new file mode 100644 index 000000000..d9f5d816f --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5.h @@ -0,0 +1,848 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_H_ +#define RTE_PMD_MLX5_H_ + +#include <stddef.h> +#include <stdbool.h> +#include <stdint.h> +#include <limits.h> +#include <net/if.h> +#include <netinet/in.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_pci.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_rwlock.h> +#include <rte_interrupts.h> +#include <rte_errno.h> +#include <rte_flow.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> +#include <mlx5_nl.h> +#include <mlx5_common_mp.h> +#include <mlx5_common_mr.h> + +#include "mlx5_defs.h" +#include "mlx5_utils.h" +#include "mlx5_autoconf.h" + + +enum mlx5_ipool_index { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + MLX5_IPOOL_DECAP_ENCAP = 0, /* Pool for encap/decap resource. */ + MLX5_IPOOL_PUSH_VLAN, /* Pool for push vlan resource. */ + MLX5_IPOOL_TAG, /* Pool for tag resource. */ + MLX5_IPOOL_PORT_ID, /* Pool for port id resource. */ + MLX5_IPOOL_JUMP, /* Pool for jump resource. */ +#endif + MLX5_IPOOL_MTR, /* Pool for meter resource. */ + MLX5_IPOOL_MCP, /* Pool for metadata resource. */ + MLX5_IPOOL_HRXQ, /* Pool for hrxq resource. */ + MLX5_IPOOL_MLX5_FLOW, /* Pool for mlx5 flow handle. */ + MLX5_IPOOL_RTE_FLOW, /* Pool for rte_flow. */ + MLX5_IPOOL_MAX, +}; + +/** Key string for IPC. */ +#define MLX5_MP_NAME "net_mlx5_mp" + + +LIST_HEAD(mlx5_dev_list, mlx5_ibv_shared); + +/* Shared data between primary and secondary processes. */ +struct mlx5_shared_data { + rte_spinlock_t lock; + /* Global spinlock for primary and secondary processes. */ + int init_done; /* Whether primary has done initialization. */ + unsigned int secondary_cnt; /* Number of secondary processes init'd. */ + struct mlx5_dev_list mem_event_cb_list; + rte_rwlock_t mem_event_rwlock; +}; + +/* Per-process data structure, not visible to other processes. */ +struct mlx5_local_data { + int init_done; /* Whether a secondary has done initialization. */ +}; + +extern struct mlx5_shared_data *mlx5_shared_data; + +struct mlx5_counter_ctrl { + /* Name of the counter. */ + char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE]; + /* Name of the counter on the device table. */ + char ctr_name[RTE_ETH_XSTATS_NAME_SIZE]; + uint32_t ib:1; /**< Nonzero for IB counters. */ +}; + +struct mlx5_xstats_ctrl { + /* Number of device stats. */ + uint16_t stats_n; + /* Number of device stats identified by PMD. */ + uint16_t mlx5_stats_n; + /* Index in the device counters table. */ + uint16_t dev_table_idx[MLX5_MAX_XSTATS]; + uint64_t base[MLX5_MAX_XSTATS]; + uint64_t xstats[MLX5_MAX_XSTATS]; + uint64_t hw_stats[MLX5_MAX_XSTATS]; + struct mlx5_counter_ctrl info[MLX5_MAX_XSTATS]; +}; + +struct mlx5_stats_ctrl { + /* Base for imissed counter. */ + uint64_t imissed_base; + uint64_t imissed; +}; + +/* Default PMD specific parameter value. */ +#define MLX5_ARG_UNSET (-1) + +#define MLX5_LRO_SUPPORTED(dev) \ + (((struct mlx5_priv *)((dev)->data->dev_private))->config.lro.supported) + +/* Maximal size of coalesced segment for LRO is set in chunks of 256 Bytes. */ +#define MLX5_LRO_SEG_CHUNK_SIZE 256u + +/* Maximal size of aggregated LRO packet. */ +#define MLX5_MAX_LRO_SIZE (UINT8_MAX * MLX5_LRO_SEG_CHUNK_SIZE) + +/* LRO configurations structure. */ +struct mlx5_lro_config { + uint32_t supported:1; /* Whether LRO is supported. */ + uint32_t timeout; /* User configuration. */ +}; + +/* + * Device configuration structure. + * + * Merged configuration from: + * + * - Device capabilities, + * - User device parameters disabled features. + */ +struct mlx5_dev_config { + unsigned int hw_csum:1; /* Checksum offload is supported. 
*/ + unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */ + unsigned int hw_vlan_insert:1; /* VLAN insertion in WQE is supported. */ + unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ + unsigned int hw_padding:1; /* End alignment padding is supported. */ + unsigned int vf:1; /* This is a VF. */ + unsigned int tunnel_en:1; + /* Whether tunnel stateless offloads are supported. */ + unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */ + unsigned int cqe_comp:1; /* CQE compression is enabled. */ + unsigned int cqe_pad:1; /* CQE padding is enabled. */ + unsigned int tso:1; /* Whether TSO is supported. */ + unsigned int rx_vec_en:1; /* Rx vector is enabled. */ + unsigned int mr_ext_memseg_en:1; + /* Whether memseg should be extended for MR creation. */ + unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */ + unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */ + unsigned int dv_esw_en:1; /* Enable E-Switch DV flow. */ + unsigned int dv_flow_en:1; /* Enable DV flow. */ + unsigned int dv_xmeta_en:2; /* Enable extensive flow metadata. */ + unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */ + unsigned int devx:1; /* Whether devx interface is available or not. */ + unsigned int dest_tir:1; /* Whether advanced DR API is available. */ + struct { + unsigned int enabled:1; /* Whether MPRQ is enabled. */ + unsigned int stride_num_n; /* Number of strides. */ + unsigned int stride_size_n; /* Size of a stride. */ + unsigned int min_stride_size_n; /* Min size of a stride. */ + unsigned int max_stride_size_n; /* Max size of a stride. */ + unsigned int max_memcpy_len; + /* Maximum packet size to memcpy Rx packets. */ + unsigned int min_rxqs_num; + /* Rx queue count threshold to enable MPRQ. */ + } mprq; /* Configurations for Multi-Packet RQ. */ + int mps; /* Multi-packet send supported mode. */ + int dbnc; /* Skip doorbell register write barrier. */ + unsigned int flow_prio; /* Number of flow priorities. */ + enum modify_reg flow_mreg_c[MLX5_MREG_C_NUM]; + /* Availibility of mreg_c's. */ + unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */ + unsigned int ind_table_max_size; /* Maximum indirection table size. */ + unsigned int max_dump_files_num; /* Maximum dump files per queue. */ + unsigned int log_hp_size; /* Single hairpin queue data size in total. */ + int txqs_inline; /* Queue number threshold for inlining. */ + int txq_inline_min; /* Minimal amount of data bytes to inline. */ + int txq_inline_max; /* Max packet size for inlining with SEND. */ + int txq_inline_mpw; /* Max packet size for inlining with eMPW. */ + struct mlx5_hca_attr hca_attr; /* HCA attributes. */ + struct mlx5_lro_config lro; /* LRO configuration. */ +}; + + +/** + * Type of object being allocated. + */ +enum mlx5_verbs_alloc_type { + MLX5_VERBS_ALLOC_TYPE_NONE, + MLX5_VERBS_ALLOC_TYPE_TX_QUEUE, + MLX5_VERBS_ALLOC_TYPE_RX_QUEUE, +}; + +/* Structure for VF VLAN workaround. */ +struct mlx5_vf_vlan { + uint32_t tag:12; + uint32_t created:1; +}; + +/** + * Verbs allocator needs a context to know in the callback which kind of + * resources it is allocating. + */ +struct mlx5_verbs_alloc_ctx { + enum mlx5_verbs_alloc_type type; /* Kind of object being allocated. */ + const void *obj; /* Pointer to the DPDK object. */ +}; + +/* Flow drop context necessary due to Verbs API. */ +struct mlx5_drop { + struct mlx5_hrxq *hrxq; /* Hash Rx queue queue. */ + struct mlx5_rxq_obj *rxq; /* Rx queue object. 
*/ +}; + +#define MLX5_COUNTERS_PER_POOL 512 +#define MLX5_MAX_PENDING_QUERIES 4 +#define MLX5_CNT_CONTAINER_RESIZE 64 +#define MLX5_CNT_AGE_OFFSET 0x80000000 +#define CNT_SIZE (sizeof(struct mlx5_flow_counter)) +#define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext)) +#define AGE_SIZE (sizeof(struct mlx5_age_param)) +#define MLX5_AGING_TIME_DELAY 7 +#define CNT_POOL_TYPE_EXT (1 << 0) +#define CNT_POOL_TYPE_AGE (1 << 1) +#define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT) +#define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE) +#define MLX_CNT_IS_AGE(counter) ((counter) & MLX5_CNT_AGE_OFFSET ? 1 : 0) +#define MLX5_CNT_LEN(pool) \ + (CNT_SIZE + \ + (IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \ + (IS_EXT_POOL(pool) ? CNTEXT_SIZE : 0)) +#define MLX5_POOL_GET_CNT(pool, index) \ + ((struct mlx5_flow_counter *) \ + ((uint8_t *)((pool) + 1) + (index) * (MLX5_CNT_LEN(pool)))) +#define MLX5_CNT_ARRAY_IDX(pool, cnt) \ + ((int)(((uint8_t *)(cnt) - (uint8_t *)((pool) + 1)) / \ + MLX5_CNT_LEN(pool))) +/* + * The pool index and offset of counter in the pool array makes up the + * counter index. In case the counter is from pool 0 and offset 0, it + * should plus 1 to avoid index 0, since 0 means invalid counter index + * currently. + */ +#define MLX5_MAKE_CNT_IDX(pi, offset) \ + ((pi) * MLX5_COUNTERS_PER_POOL + (offset) + 1) +#define MLX5_CNT_TO_CNT_EXT(pool, cnt) \ + ((struct mlx5_flow_counter_ext *)\ + ((uint8_t *)((cnt) + 1) + \ + (IS_AGE_POOL(pool) ? AGE_SIZE : 0))) +#define MLX5_GET_POOL_CNT_EXT(pool, offset) \ + MLX5_CNT_TO_CNT_EXT(pool, MLX5_POOL_GET_CNT((pool), (offset))) +#define MLX5_CNT_TO_AGE(cnt) \ + ((struct mlx5_age_param *)((cnt) + 1)) + +struct mlx5_flow_counter_pool; + +/*age status*/ +enum { + AGE_FREE, /* Initialized state. */ + AGE_CANDIDATE, /* Counter assigned to flows. */ + AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */ +}; + +#define MLX5_CNT_CONTAINER(sh, batch, age) (&(sh)->cmng.ccont \ + [(batch) * 2 + (age)]) + +enum { + MLX5_CCONT_TYPE_SINGLE, + MLX5_CCONT_TYPE_SINGLE_FOR_AGE, + MLX5_CCONT_TYPE_BATCH, + MLX5_CCONT_TYPE_BATCH_FOR_AGE, + MLX5_CCONT_TYPE_MAX, +}; + +/* Counter age parameter. */ +struct mlx5_age_param { + rte_atomic16_t state; /**< Age state. */ + uint16_t port_id; /**< Port id of the counter. */ + uint32_t timeout:15; /**< Age timeout in unit of 0.1sec. */ + uint32_t expire:16; /**< Expire time(0.1sec) in the future. */ + void *context; /**< Flow counter age context. */ +}; + +struct flow_counter_stats { + uint64_t hits; + uint64_t bytes; +}; + +/* Generic counters information. */ +struct mlx5_flow_counter { + TAILQ_ENTRY(mlx5_flow_counter) next; + /**< Pointer to the next flow counter structure. */ + union { + uint64_t hits; /**< Reset value of hits packets. */ + int64_t query_gen; /**< Generation of the last release. */ + }; + uint64_t bytes; /**< Reset value of bytes. */ + void *action; /**< Pointer to the dv action. */ +}; + +/* Extend counters information for none batch counters. */ +struct mlx5_flow_counter_ext { + uint32_t shared:1; /**< Share counter ID with other flow rules. */ + uint32_t batch: 1; + /**< Whether the counter was allocated by batch command. */ + uint32_t ref_cnt:30; /**< Reference counter. */ + uint32_t id; /**< User counter ID. */ + union { /**< Holds the counters for the rule. */ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct ibv_counter_set *cs; +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct ibv_counters *cs; +#endif + struct mlx5_devx_obj *dcs; /**< Counter Devx object. 
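A minimal sketch (hypothetical helper, not taken from this header) of how a counter index built by MLX5_MAKE_CNT_IDX() above can be decoded back into a pool index and an in-pool offset; the aging flag is assumed to ride in the MLX5_CNT_AGE_OFFSET bit, as MLX_CNT_IS_AGE() implies:

static inline void
mlx5_example_decode_cnt_idx(uint32_t cnt_idx, uint32_t *pool_idx,
			    uint32_t *offset)
{
	uint32_t idx = cnt_idx & ~MLX5_CNT_AGE_OFFSET; /* Drop the age flag. */

	idx -= 1; /* MLX5_MAKE_CNT_IDX() adds 1 so that 0 stays invalid. */
	*pool_idx = idx / MLX5_COUNTERS_PER_POOL;
	*offset = idx % MLX5_COUNTERS_PER_POOL;
}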
*/ + }; +}; + +TAILQ_HEAD(mlx5_counters, mlx5_flow_counter); + +/* Generic counter pool structure - query is in pool resolution. */ +struct mlx5_flow_counter_pool { + TAILQ_ENTRY(mlx5_flow_counter_pool) next; + struct mlx5_counters counters; /* Free counter list. */ + union { + struct mlx5_devx_obj *min_dcs; + rte_atomic64_t a64_dcs; + }; + /* The devx object of the minimum counter ID. */ + rte_atomic64_t start_query_gen; /* Query start round. */ + rte_atomic64_t end_query_gen; /* Query end round. */ + uint32_t index; /* Pool index in container. */ + uint8_t type; /* Memory type behind the counter array. */ + rte_spinlock_t sl; /* The pool lock. */ + struct mlx5_counter_stats_raw *raw; + struct mlx5_counter_stats_raw *raw_hw; /* The raw on HW working. */ +}; + +struct mlx5_counter_stats_raw; + +/* Memory management structure for group of counter statistics raws. */ +struct mlx5_counter_stats_mem_mng { + LIST_ENTRY(mlx5_counter_stats_mem_mng) next; + struct mlx5_counter_stats_raw *raws; + struct mlx5_devx_obj *dm; + struct mlx5dv_devx_umem *umem; +}; + +/* Raw memory structure for the counter statistics values of a pool. */ +struct mlx5_counter_stats_raw { + LIST_ENTRY(mlx5_counter_stats_raw) next; + int min_dcs_id; + struct mlx5_counter_stats_mem_mng *mem_mng; + volatile struct flow_counter_stats *data; +}; + +TAILQ_HEAD(mlx5_counter_pools, mlx5_flow_counter_pool); + +/* Container structure for counter pools. */ +struct mlx5_pools_container { + rte_atomic16_t n_valid; /* Number of valid pools. */ + uint16_t n; /* Number of pools. */ + rte_spinlock_t resize_sl; /* The resize lock. */ + struct mlx5_counter_pools pool_list; /* Counter pool list. */ + struct mlx5_flow_counter_pool **pools; /* Counter pool array. */ + struct mlx5_counter_stats_mem_mng *mem_mng; + /* Hold the memory management for the next allocated pools raws. */ +}; + +/* Counter global management structure. */ +struct mlx5_flow_counter_mng { + struct mlx5_pools_container ccont[MLX5_CCONT_TYPE_MAX]; + struct mlx5_counters flow_counters; /* Legacy flow counter list. */ + uint8_t pending_queries; + uint8_t batch; + uint16_t pool_index; + uint8_t age; + uint8_t query_thread_on; + LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs; + LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws; +}; + +#define MLX5_AGE_EVENT_NEW 1 +#define MLX5_AGE_TRIGGER 2 +#define MLX5_AGE_SET(age_info, BIT) \ + ((age_info)->flags |= (1 << (BIT))) +#define MLX5_AGE_GET(age_info, BIT) \ + ((age_info)->flags & (1 << (BIT))) +#define GET_PORT_AGE_INFO(priv) \ + (&((priv)->sh->port[(priv)->ibv_port - 1].age_info)) + +/* Aging information for per port. */ +struct mlx5_age_info { + uint8_t flags; /*Indicate if is new event or need be trigered*/ + struct mlx5_counters aged_counters; /* Aged flow counter list. */ + rte_spinlock_t aged_sl; /* Aged flow counter list lock. */ +}; + +/* Per port data of shared IB device. */ +struct mlx5_ibv_shared_port { + uint32_t ih_port_id; + uint32_t devx_ih_port_id; + /* + * Interrupt handler port_id. Used by shared interrupt + * handler to find the corresponding rte_eth device + * by IB port index. If value is equal or greater + * RTE_MAX_ETHPORTS it means there is no subhandler + * installed for specified IB port index. + */ + struct mlx5_age_info age_info; + /* Aging information for per port. */ +}; + +/* Table key of the hash organization. */ +union mlx5_flow_tbl_key { + struct { + /* Table ID should be at the lowest address. */ + uint32_t table_id; /**< ID of the table. 
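To make the container selection above concrete, MLX5_CNT_CONTAINER() picks slot batch * 2 + age of mlx5_flow_counter_mng::ccont[], which matches the MLX5_CCONT_TYPE_* ordering (worked illustration only):

/*
 *   (batch 0, age 0) -> ccont[0]  MLX5_CCONT_TYPE_SINGLE
 *   (batch 0, age 1) -> ccont[1]  MLX5_CCONT_TYPE_SINGLE_FOR_AGE
 *   (batch 1, age 0) -> ccont[2]  MLX5_CCONT_TYPE_BATCH
 *   (batch 1, age 1) -> ccont[3]  MLX5_CCONT_TYPE_BATCH_FOR_AGE
 * e.g. MLX5_CNT_CONTAINER(sh, 1, 0) yields the batch, non-aging container.
 */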
*/ + uint16_t reserved; /**< must be zero for comparison. */ + uint8_t domain; /**< 1 - FDB, 0 - NIC TX/RX. */ + uint8_t direction; /**< 1 - egress, 0 - ingress. */ + }; + uint64_t v64; /**< full 64bits value of key */ +}; + +/* Table structure. */ +struct mlx5_flow_tbl_resource { + void *obj; /**< Pointer to DR table object. */ + rte_atomic32_t refcnt; /**< Reference counter. */ +}; + +#define MLX5_MAX_TABLES UINT16_MAX +#define MLX5_FLOW_TABLE_LEVEL_METER (UINT16_MAX - 3) +#define MLX5_FLOW_TABLE_LEVEL_SUFFIX (UINT16_MAX - 2) +#define MLX5_HAIRPIN_TX_TABLE (UINT16_MAX - 1) +/* Reserve the last two tables for metadata register copy. */ +#define MLX5_FLOW_MREG_ACT_TABLE_GROUP (MLX5_MAX_TABLES - 1) +#define MLX5_FLOW_MREG_CP_TABLE_GROUP (MLX5_MAX_TABLES - 2) +/* Tables for metering splits should be added here. */ +#define MLX5_MAX_TABLES_EXTERNAL (MLX5_MAX_TABLES - 3) +#define MLX5_MAX_TABLES_FDB UINT16_MAX + +#define MLX5_DBR_PAGE_SIZE 4096 /* Must be >= 512. */ +#define MLX5_DBR_SIZE 8 +#define MLX5_DBR_PER_PAGE (MLX5_DBR_PAGE_SIZE / MLX5_DBR_SIZE) +#define MLX5_DBR_BITMAP_SIZE (MLX5_DBR_PER_PAGE / 64) + +struct mlx5_devx_dbr_page { + /* Door-bell records, must be first member in structure. */ + uint8_t dbrs[MLX5_DBR_PAGE_SIZE]; + LIST_ENTRY(mlx5_devx_dbr_page) next; /* Pointer to the next element. */ + struct mlx5dv_devx_umem *umem; + uint32_t dbr_count; /* Number of door-bell records in use. */ + /* 1 bit marks matching door-bell is in use. */ + uint64_t dbr_bitmap[MLX5_DBR_BITMAP_SIZE]; +}; + +/* ID generation structure. */ +struct mlx5_flow_id_pool { + uint32_t *free_arr; /**< Pointer to the a array of free values. */ + uint32_t base_index; + /**< The next index that can be used without any free elements. */ + uint32_t *curr; /**< Pointer to the index to pop. */ + uint32_t *last; /**< Pointer to the last element in the empty arrray. */ + uint32_t max_id; /**< Maximum id can be allocated from the pool. */ +}; + +/* + * Shared Infiniband device context for Master/Representors + * which belong to same IB device with multiple IB ports. + **/ +struct mlx5_ibv_shared { + LIST_ENTRY(mlx5_ibv_shared) next; + uint32_t refcnt; + uint32_t devx:1; /* Opened with DV. */ + uint32_t max_port; /* Maximal IB device port index. */ + struct ibv_context *ctx; /* Verbs/DV context. */ + struct ibv_pd *pd; /* Protection Domain. */ + uint32_t pdn; /* Protection Domain number. */ + uint32_t tdn; /* Transport Domain number. */ + char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */ + struct ibv_device_attr_ex device_attr; /* Device properties. */ + LIST_ENTRY(mlx5_ibv_shared) mem_event_cb; + /**< Called by memory event callback. */ + struct mlx5_mr_share_cache share_cache; + /* Shared DV/DR flow data section. */ + pthread_mutex_t dv_mutex; /* DV context mutex. */ + uint32_t dv_meta_mask; /* flow META metadata supported mask. */ + uint32_t dv_mark_mask; /* flow MARK metadata supported mask. */ + uint32_t dv_regc0_mask; /* available bits of metatada reg_c[0]. */ + uint32_t dv_refcnt; /* DV/DR data reference counter. */ + void *fdb_domain; /* FDB Direct Rules name space handle. */ + void *rx_domain; /* RX Direct Rules name space handle. */ + void *tx_domain; /* TX Direct Rules name space handle. */ + struct mlx5_hlist *flow_tbls; + /* Direct Rules tables for FDB, NIC TX+RX */ + void *esw_drop_action; /* Pointer to DR E-Switch drop action. */ + void *pop_vlan_action; /* Pointer to DR pop VLAN action. 
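With the door-bell constants above, one page carries MLX5_DBR_PAGE_SIZE / MLX5_DBR_SIZE = 4096 / 8 = 512 records tracked by a 512 / 64 = 8-word bitmap. A hypothetical helper, assuming record i maps to bit i % 64 of word i / 64 as the dbr_bitmap comment suggests, could test one slot like this:

static inline int
mlx5_example_dbr_busy(const struct mlx5_devx_dbr_page *page, unsigned int i)
{
	return (page->dbr_bitmap[i / 64] >> (i % 64)) & 1;
}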
*/ + uint32_t encaps_decaps; /* Encap/decap action indexed memory list. */ + LIST_HEAD(modify_cmd, mlx5_flow_dv_modify_hdr_resource) modify_cmds; + struct mlx5_hlist *tag_table; + uint32_t port_id_action_list; /* List of port ID actions. */ + uint32_t push_vlan_action_list; /* List of push VLAN actions. */ + struct mlx5_flow_counter_mng cmng; /* Counters management structure. */ + struct mlx5_indexed_pool *ipool[MLX5_IPOOL_MAX]; + /* Memory Pool for mlx5 flow resources. */ + /* Shared interrupt handler section. */ + pthread_mutex_t intr_mutex; /* Interrupt config mutex. */ + uint32_t intr_cnt; /* Interrupt handler reference counter. */ + struct rte_intr_handle intr_handle; /* Interrupt handler for device. */ + uint32_t devx_intr_cnt; /* Devx interrupt handler reference counter. */ + struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */ + struct mlx5dv_devx_cmd_comp *devx_comp; /* DEVX async comp obj. */ + struct mlx5_devx_obj *tis; /* TIS object. */ + struct mlx5_devx_obj *td; /* Transport domain. */ + struct mlx5_flow_id_pool *flow_id_pool; /* Flow ID pool. */ + struct mlx5_ibv_shared_port port[]; /* per device port data array. */ +}; + +/* Per-process private structure. */ +struct mlx5_proc_priv { + size_t uar_table_sz; + /* Size of UAR register table. */ + void *uar_table[]; + /* Table of UAR registers for each process. */ +}; + +/* MTR profile list. */ +TAILQ_HEAD(mlx5_mtr_profiles, mlx5_flow_meter_profile); +/* MTR list. */ +TAILQ_HEAD(mlx5_flow_meters, mlx5_flow_meter); + +#define MLX5_PROC_PRIV(port_id) \ + ((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private) + +struct mlx5_priv { + struct rte_eth_dev_data *dev_data; /* Pointer to device data. */ + struct mlx5_ibv_shared *sh; /* Shared IB device context. */ + uint32_t ibv_port; /* IB device port number. */ + struct rte_pci_device *pci_dev; /* Backend PCI device. */ + struct rte_ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */ + BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES); + /* Bit-field of MAC addresses owned by the PMD. */ + uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */ + unsigned int vlan_filter_n; /* Number of configured VLAN filters. */ + /* Device properties. */ + uint16_t mtu; /* Configured MTU. */ + unsigned int isolated:1; /* Whether isolated mode is enabled. */ + unsigned int representor:1; /* Device is a port representor. */ + unsigned int master:1; /* Device is a E-Switch master. */ + unsigned int dr_shared:1; /* DV/DR data is shared. */ + unsigned int counter_fallback:1; /* Use counter fallback management. */ + unsigned int mtr_en:1; /* Whether support meter. */ + unsigned int mtr_reg_share:1; /* Whether support meter REG_C share. */ + uint16_t domain_id; /* Switch domain identifier. */ + uint16_t vport_id; /* Associated VF vport index (if any). */ + uint32_t vport_meta_tag; /* Used for vport index match ove VF LAG. */ + uint32_t vport_meta_mask; /* Used for vport index field match mask. */ + int32_t representor_id; /* Port representor identifier. */ + int32_t pf_bond; /* >=0 means PF index in bonding configuration. */ + unsigned int if_index; /* Associated kernel network device index. */ + /* RX/TX queues. */ + unsigned int rxqs_n; /* RX queues array size. */ + unsigned int txqs_n; /* TX queues array size. */ + struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */ + struct mlx5_txq_data *(*txqs)[]; /* TX queues. */ + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ + struct rte_eth_rss_conf rss_conf; /* RSS configuration. 
*/ + unsigned int (*reta_idx)[]; /* RETA index table. */ + unsigned int reta_idx_n; /* RETA index size. */ + struct mlx5_drop drop_queue; /* Flow drop queues. */ + uint32_t flows; /* RTE Flow rules. */ + uint32_t ctrl_flows; /* Control flow rules. */ + void *inter_flows; /* Intermediate resources for flow creation. */ + void *rss_desc; /* Intermediate rss description resources. */ + int flow_idx; /* Intermediate device flow index. */ + int flow_nested_idx; /* Intermediate device flow index, nested. */ + LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */ + LIST_HEAD(rxqobj, mlx5_rxq_obj) rxqsobj; /* Verbs/DevX Rx queues. */ + uint32_t hrxqs; /* Verbs Hash Rx queues. */ + LIST_HEAD(txq, mlx5_txq_ctrl) txqsctrl; /* DPDK Tx queues. */ + LIST_HEAD(txqobj, mlx5_txq_obj) txqsobj; /* Verbs/DevX Tx queues. */ + /* Indirection tables. */ + LIST_HEAD(ind_tables, mlx5_ind_table_obj) ind_tbls; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + struct ibv_flow_action *verbs_action; + /**< Verbs modify header action object. */ + uint8_t ft_type; /**< Flow table type, Rx or Tx. */ + uint8_t max_lro_msg_size; + /* Tags resources cache. */ + uint32_t link_speed_capa; /* Link speed capabilities. */ + struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */ + struct mlx5_stats_ctrl stats_ctrl; /* Stats control. */ + struct mlx5_dev_config config; /* Device configuration. */ + struct mlx5_verbs_alloc_ctx verbs_alloc_ctx; + /* Context for Verbs allocator. */ + int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */ + int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */ + LIST_HEAD(dbrpage, mlx5_devx_dbr_page) dbrpgs; /* Door-bell pages. */ + struct mlx5_nl_vlan_vmwa_context *vmwa_context; /* VLAN WA context. */ + struct mlx5_flow_id_pool *qrss_id_pool; + struct mlx5_hlist *mreg_cp_tbl; + /* Hash table of Rx metadata register copy table. */ + uint8_t mtr_sfx_reg; /* Meter prefix-suffix flow match REG_C. */ + uint8_t mtr_color_reg; /* Meter color match REG_C. */ + struct mlx5_mtr_profiles flow_meter_profiles; /* MTR profile list. */ + struct mlx5_flow_meters flow_meters; /* MTR list. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */ + rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX]; + /* UAR same-page access control required in 32bit implementations. */ +#endif + uint8_t skip_default_rss_reta; /* Skip configuration of default reta. */ + uint8_t fdb_def_rule; /* Whether fdb jump to table 1 is configured. */ + struct mlx5_mp_id mp_id; /* ID of a multi-process process */ + LIST_HEAD(fdir, mlx5_fdir_flow) fdir_flows; /* fdir flows. */ +}; + +#define PORT_ID(priv) ((priv)->dev_data->port_id) +#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)]) + +/* mlx5.c */ + +int mlx5_getenv_int(const char *); +int mlx5_proc_priv_init(struct rte_eth_dev *dev); +int64_t mlx5_get_dbr(struct rte_eth_dev *dev, + struct mlx5_devx_dbr_page **dbr_page); +int32_t mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, + uint64_t offset); +int mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev, + struct rte_eth_udp_tunnel *udp_tunnel); +uint16_t mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev); + +/* Macro to iterate over all valid ports for mlx5 driver. 
*/ +#define MLX5_ETH_FOREACH_DEV(port_id, pci_dev) \ + for (port_id = mlx5_eth_find_next(0, pci_dev); \ + port_id < RTE_MAX_ETHPORTS; \ + port_id = mlx5_eth_find_next(port_id + 1, pci_dev)) + +/* mlx5_ethdev.c */ + +int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]); +int mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]); +unsigned int mlx5_ifindex(const struct rte_eth_dev *dev); +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr); +int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu); +int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, + unsigned int flags); +int mlx5_dev_configure(struct rte_eth_dev *dev); +int mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info); +int mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock); +int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size); +const uint32_t *mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev); +int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete); +int mlx5_force_link_status_change(struct rte_eth_dev *dev, int status); +int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu); +int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +void mlx5_dev_link_status_handler(void *arg); +void mlx5_dev_interrupt_handler(void *arg); +void mlx5_dev_interrupt_handler_devx(void *arg); +void mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev); +int mlx5_set_link_down(struct rte_eth_dev *dev); +int mlx5_set_link_up(struct rte_eth_dev *dev); +int mlx5_is_removed(struct rte_eth_dev *dev); +eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev); +eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev); +struct mlx5_priv *mlx5_port_to_eswitch_info(uint16_t port, bool valid); +struct mlx5_priv *mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev); +int mlx5_sysfs_switch_info(unsigned int ifindex, + struct mlx5_switch_info *info); +void mlx5_sysfs_check_switch_info(bool device_dir, + struct mlx5_switch_info *switch_info); +void mlx5_translate_port_name(const char *port_name_in, + struct mlx5_switch_info *port_info_out); +void mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, + rte_intr_callback_fn cb_fn, void *cb_arg); +int mlx5_get_module_info(struct rte_eth_dev *dev, + struct rte_eth_dev_module_info *modinfo); +int mlx5_get_module_eeprom(struct rte_eth_dev *dev, + struct rte_dev_eeprom_info *info); +int mlx5_hairpin_cap_get(struct rte_eth_dev *dev, + struct rte_eth_hairpin_cap *cap); +int mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev); + +/* mlx5_mac.c */ + +int mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN]); +void mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); +int mlx5_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, + uint32_t index, uint32_t vmdq); +struct mlx5_nl_vlan_vmwa_context *mlx5_vlan_vmwa_init + (struct rte_eth_dev *dev, uint32_t ifindex); +int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr); +int mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct rte_ether_addr *mc_addr_set, + uint32_t nb_mc_addr); + +/* 
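A usage sketch for the MLX5_ETH_FOREACH_DEV() iterator defined above; pci_dev is a hypothetical struct rte_pci_device pointer and the loop visits every ethdev port this PMD instantiated for that device:

uint16_t port_id;

MLX5_ETH_FOREACH_DEV(port_id, pci_dev) {
	struct mlx5_priv *p = rte_eth_devices[port_id].data->dev_private;

	/* 'p' is the mlx5 private data of one port bound to pci_dev. */
	(void)p;
}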
mlx5_rss.c */ + +int mlx5_rss_hash_update(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size); +int mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); +int mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); + +/* mlx5_rxmode.c */ + +int mlx5_promiscuous_enable(struct rte_eth_dev *dev); +int mlx5_promiscuous_disable(struct rte_eth_dev *dev); +int mlx5_allmulticast_enable(struct rte_eth_dev *dev); +int mlx5_allmulticast_disable(struct rte_eth_dev *dev); + +/* mlx5_stats.c */ + +void mlx5_stats_init(struct rte_eth_dev *dev); +int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats); +int mlx5_stats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n); +int mlx5_xstats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, + unsigned int n); + +/* mlx5_vlan.c */ + +int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on); +void mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on); +int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask); +void mlx5_vlan_vmwa_exit(struct mlx5_nl_vlan_vmwa_context *ctx); +void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vf_vlan); +void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vf_vlan); + +/* mlx5_trigger.c */ + +int mlx5_dev_start(struct rte_eth_dev *dev); +void mlx5_dev_stop(struct rte_eth_dev *dev); +int mlx5_traffic_enable(struct rte_eth_dev *dev); +void mlx5_traffic_disable(struct rte_eth_dev *dev); +int mlx5_traffic_restart(struct rte_eth_dev *dev); + +/* mlx5_flow.c */ + +int mlx5_flow_discover_mreg_c(struct rte_eth_dev *eth_dev); +bool mlx5_flow_ext_mreg_supported(struct rte_eth_dev *dev); +int mlx5_flow_discover_priorities(struct rte_eth_dev *dev); +void mlx5_flow_print(struct rte_flow *flow); +int mlx5_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +struct rte_flow *mlx5_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error); +void mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active); +int mlx5_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error); +int mlx5_flow_query(struct rte_eth_dev *dev, struct rte_flow *flow, + const struct rte_flow_action *action, void *data, + struct rte_flow_error *error); +int mlx5_flow_isolate(struct rte_eth_dev *dev, int enable, + struct rte_flow_error *error); +int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); +int mlx5_flow_start(struct rte_eth_dev *dev, uint32_t *list); +void mlx5_flow_stop(struct rte_eth_dev *dev, uint32_t *list); +int mlx5_flow_start_default(struct rte_eth_dev *dev); +void mlx5_flow_stop_default(struct rte_eth_dev *dev); +void 
mlx5_flow_alloc_intermediate(struct rte_eth_dev *dev); +void mlx5_flow_free_intermediate(struct rte_eth_dev *dev); +int mlx5_flow_verify(struct rte_eth_dev *dev); +int mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev, uint32_t queue); +int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask, + struct rte_flow_item_vlan *vlan_spec, + struct rte_flow_item_vlan *vlan_mask); +int mlx5_ctrl_flow(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask); +struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev); +int mlx5_flow_create_drop_queue(struct rte_eth_dev *dev); +void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev); +void mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh, + uint64_t async_id, int status); +void mlx5_set_query_alarm(struct mlx5_ibv_shared *sh); +void mlx5_flow_query_alarm(void *arg); +uint32_t mlx5_counter_alloc(struct rte_eth_dev *dev); +void mlx5_counter_free(struct rte_eth_dev *dev, uint32_t cnt); +int mlx5_counter_query(struct rte_eth_dev *dev, uint32_t cnt, + bool clear, uint64_t *pkts, uint64_t *bytes); +int mlx5_flow_dev_dump(struct rte_eth_dev *dev, FILE *file, + struct rte_flow_error *error); +void mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev); +int mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts, + uint32_t nb_contexts, struct rte_flow_error *error); + +/* mlx5_mp.c */ +int mlx5_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer); +int mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer); +void mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev); +void mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev); + +/* mlx5_socket.c */ + +int mlx5_pmd_socket_init(void); + +/* mlx5_flow_meter.c */ + +int mlx5_flow_meter_ops_get(struct rte_eth_dev *dev, void *arg); +struct mlx5_flow_meter *mlx5_flow_meter_find(struct mlx5_priv *priv, + uint32_t meter_id); +struct mlx5_flow_meter *mlx5_flow_meter_attach + (struct mlx5_priv *priv, + uint32_t meter_id, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +void mlx5_flow_meter_detach(struct mlx5_flow_meter *fm); + +#endif /* RTE_PMD_MLX5_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h new file mode 100644 index 000000000..260f58429 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_DEFS_H_ +#define RTE_PMD_MLX5_DEFS_H_ + +#include <rte_ethdev_driver.h> +#include <rte_vxlan.h> + +#include "mlx5_autoconf.h" + +/* Reported driver name. */ +#define MLX5_DRIVER_NAME "net_mlx5" + +/* Maximum number of simultaneous VLAN filters. */ +#define MLX5_MAX_VLAN_IDS 128 + +/* + * Request TX completion every time descriptors reach this threshold since + * the previous request. Must be a power of two for performance reasons. + */ +#define MLX5_TX_COMP_THRESH 32u + +/* + * Request TX completion every time the total number of WQEBBs used for inlining + * packets exceeds the size of WQ divided by this divisor. Better to be power of + * two for performance. + */ +#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3) + +/* + * Maximal amount of normal completion CQEs + * processed in one call of tx_burst() routine. 
+ */ +#define MLX5_TX_COMP_MAX_CQE 2u + + +/* Size of per-queue MR cache array for linear search. */ +#define MLX5_MR_CACHE_N 8 + +/* Size of MR cache table for binary search. */ +#define MLX5_MR_BTREE_CACHE_N 256 + +/* + * If defined, only use software counters. The PMD will never ask the hardware + * for these, and many of them won't be available. + */ +#ifndef MLX5_PMD_SOFT_COUNTERS +#define MLX5_PMD_SOFT_COUNTERS 1 +#endif + +/* Switch port ID parameters for bonding configurations. */ +#define MLX5_PORT_ID_BONDING_PF_MASK 0xf +#define MLX5_PORT_ID_BONDING_PF_SHIFT 0xf + +/* Alarm timeout. */ +#define MLX5_ALARM_TIMEOUT_US 100000 + +/* Maximum number of extended statistics counters. */ +#define MLX5_MAX_XSTATS 32 + +/* Maximum Packet headers size (L2+L3+L4) for TSO. */ +#define MLX5_MAX_TSO_HEADER (128u + 34u) + +/* Inline data size required by NICs. */ +#define MLX5_INLINE_HSIZE_NONE 0 +#define MLX5_INLINE_HSIZE_L2 (sizeof(struct rte_ether_hdr) + \ + sizeof(struct rte_vlan_hdr)) +#define MLX5_INLINE_HSIZE_L3 (MLX5_INLINE_HSIZE_L2 + \ + sizeof(struct rte_ipv6_hdr)) +#define MLX5_INLINE_HSIZE_L4 (MLX5_INLINE_HSIZE_L3 + \ + sizeof(struct rte_tcp_hdr)) +#define MLX5_INLINE_HSIZE_INNER_L2 (MLX5_INLINE_HSIZE_L3 + \ + sizeof(struct rte_udp_hdr) + \ + sizeof(struct rte_vxlan_hdr) + \ + sizeof(struct rte_ether_hdr) + \ + sizeof(struct rte_vlan_hdr)) +#define MLX5_INLINE_HSIZE_INNER_L3 (MLX5_INLINE_HSIZE_INNER_L2 + \ + sizeof(struct rte_ipv6_hdr)) +#define MLX5_INLINE_HSIZE_INNER_L4 (MLX5_INLINE_HSIZE_INNER_L3 + \ + sizeof(struct rte_tcp_hdr)) + +/* Threshold of buffer replenishment for vectorized Rx. */ +#define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \ + (RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2)) + +/* Maximum size of burst for vectorized Rx. */ +#define MLX5_VPMD_RX_MAX_BURST 64U + +/* Recommended optimal burst size. */ +#define MLX5_RX_DEFAULT_BURST 64U +#define MLX5_TX_DEFAULT_BURST 64U + +/* Number of packets vectorized Rx can simultaneously process in a loop. */ +#define MLX5_VPMD_DESCS_PER_LOOP 4 + +/* Mask of RSS on source only or destination only. */ +#define MLX5_RSS_SRC_DST_ONLY (ETH_RSS_L3_SRC_ONLY | ETH_RSS_L3_DST_ONLY | \ + ETH_RSS_L4_SRC_ONLY | ETH_RSS_L4_DST_ONLY) + +/* Supported RSS */ +#define MLX5_RSS_HF_MASK (~(ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP | \ + MLX5_RSS_SRC_DST_ONLY)) + +/* Timeout in seconds to get a valid link status. */ +#define MLX5_LINK_STATUS_TIMEOUT 10 + +/* Number of times to retry retrieving the physical link information. */ +#define MLX5_GET_LINK_STATUS_RETRY_COUNT 3 + +/* Maximum number of UAR pages used by a port, + * These are the size and mask for an array of mutexes used to synchronize + * the access to port's UARs on platforms that do not support 64 bit writes. + * In such systems it is possible to issue the 64 bits DoorBells through two + * consecutive writes, each write 32 bits. The access to a UAR page (which can + * be accessible by all threads in the process) must be synchronized + * (for example, using a semaphore). Such a synchronization is not required + * when ringing DoorBells on different UAR pages. + * A port with 512 Tx queues uses 8, 4kBytes, UAR pages which are shared + * among the ports. + */ +#define MLX5_UAR_PAGE_NUM_MAX 64 +#define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - 1) + +/* Fields of memory mapping type in offset parameter of mmap() */ +#define MLX5_UAR_MMAP_CMD_SHIFT 8 +#define MLX5_UAR_MMAP_CMD_MASK 0xff + +/* Environment variable to control the doorbell register mapping. 
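Substituting the usual DPDK header sizes (Ethernet 14 B, VLAN 4 B, IPv6 40 B, TCP 20 B, UDP 8 B, VXLAN 8 B), the MLX5_INLINE_HSIZE_* macros above work out as follows; shown only as a worked reference:

/*
 * MLX5_INLINE_HSIZE_L2       = 14 + 4              =  18 B
 * MLX5_INLINE_HSIZE_L3       = 18 + 40             =  58 B
 * MLX5_INLINE_HSIZE_L4       = 58 + 20             =  78 B
 * MLX5_INLINE_HSIZE_INNER_L2 = 58 + 8 + 8 + 14 + 4 =  92 B
 * MLX5_INLINE_HSIZE_INNER_L3 = 92 + 40             = 132 B
 * MLX5_INLINE_HSIZE_INNER_L4 = 132 + 20            = 152 B
 */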
*/ +#define MLX5_SHUT_UP_BF "MLX5_SHUT_UP_BF" +#if defined(RTE_ARCH_ARM64) +#define MLX5_SHUT_UP_BF_DEFAULT "0" +#else +#define MLX5_SHUT_UP_BF_DEFAULT "1" +#endif + +#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD +#define MLX5_MMAP_GET_NC_PAGES_CMD 3 +#endif + +/* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */ +#define MLX5_MPRQ_STRIDE_NUM_N 6U + +/* Log 2 of the default size of a stride per WQE for Multi-Packet RQ. */ +#define MLX5_MPRQ_STRIDE_SIZE_N 11U + +/* Two-byte shift is disabled for Multi-Packet RQ. */ +#define MLX5_MPRQ_TWO_BYTE_SHIFT 0 + +/* + * Minimum size of packet to be memcpy'd instead of being attached as an + * external buffer. + */ +#define MLX5_MPRQ_MEMCPY_DEFAULT_LEN 128 + +/* Minimum number Rx queues to enable Multi-Packet RQ. */ +#define MLX5_MPRQ_MIN_RXQS 12 + +/* Cache size of mempool for Multi-Packet RQ. */ +#define MLX5_MPRQ_MP_CACHE_SZ 32U + +/* MLX5_DV_XMETA_EN supported values. */ +#define MLX5_XMETA_MODE_LEGACY 0 +#define MLX5_XMETA_MODE_META16 1 +#define MLX5_XMETA_MODE_META32 2 + +/* MLX5_TX_DB_NC supported values. */ +#define MLX5_TXDB_CACHED 0 +#define MLX5_TXDB_NCACHED 1 +#define MLX5_TXDB_HEURISTIC 2 + +/* Size of the simple hash table for metadata register table. */ +#define MLX5_FLOW_MREG_HTABLE_SZ 4096 +#define MLX5_FLOW_MREG_HNAME "MARK_COPY_TABLE" +#define MLX5_DEFAULT_COPY_ID UINT32_MAX + +/* Hairpin TX/RX queue configuration parameters. */ +#define MLX5_HAIRPIN_QUEUE_STRIDE 6 +#define MLX5_HAIRPIN_JUMBO_LOG_SIZE (14 + 2) + +/* Definition of static_assert found in /usr/include/assert.h */ +#ifndef HAVE_STATIC_ASSERT +#define static_assert _Static_assert +#endif + +#endif /* RTE_PMD_MLX5_DEFS_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c new file mode 100644 index 000000000..47f11b963 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c @@ -0,0 +1,2071 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. 
+ * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <inttypes.h> +#include <unistd.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <dirent.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> +#include <fcntl.h> +#include <stdalign.h> +#include <sys/un.h> +#include <time.h> + +#include <rte_atomic.h> +#include <rte_ethdev_driver.h> +#include <rte_bus_pci.h> +#include <rte_mbuf.h> +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_malloc.h> +#include <rte_string_fns.h> +#include <rte_rwlock.h> +#include <rte_cycles.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_common.h> + +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +/* Supported speed values found in /usr/include/linux/ethtool.h */ +#ifndef HAVE_SUPPORTED_40000baseKR4_Full +#define SUPPORTED_40000baseKR4_Full (1 << 23) +#endif +#ifndef HAVE_SUPPORTED_40000baseCR4_Full +#define SUPPORTED_40000baseCR4_Full (1 << 24) +#endif +#ifndef HAVE_SUPPORTED_40000baseSR4_Full +#define SUPPORTED_40000baseSR4_Full (1 << 25) +#endif +#ifndef HAVE_SUPPORTED_40000baseLR4_Full +#define SUPPORTED_40000baseLR4_Full (1 << 26) +#endif +#ifndef HAVE_SUPPORTED_56000baseKR4_Full +#define SUPPORTED_56000baseKR4_Full (1 << 27) +#endif +#ifndef HAVE_SUPPORTED_56000baseCR4_Full +#define SUPPORTED_56000baseCR4_Full (1 << 28) +#endif +#ifndef HAVE_SUPPORTED_56000baseSR4_Full +#define SUPPORTED_56000baseSR4_Full (1 << 29) +#endif +#ifndef HAVE_SUPPORTED_56000baseLR4_Full +#define SUPPORTED_56000baseLR4_Full (1 << 30) +#endif + +/* Add defines in case the running kernel is not the same as user headers. 
*/ +#ifndef ETHTOOL_GLINKSETTINGS +struct ethtool_link_settings { + uint32_t cmd; + uint32_t speed; + uint8_t duplex; + uint8_t port; + uint8_t phy_address; + uint8_t autoneg; + uint8_t mdio_support; + uint8_t eth_to_mdix; + uint8_t eth_tp_mdix_ctrl; + int8_t link_mode_masks_nwords; + uint32_t reserved[8]; + uint32_t link_mode_masks[]; +}; + +/* The kernel values can be found in /include/uapi/linux/ethtool.h */ +#define ETHTOOL_GLINKSETTINGS 0x0000004c +#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 +#define ETHTOOL_LINK_MODE_Autoneg_BIT 6 +#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 +#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 +#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 +#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 +#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 +#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 +#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 +#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 +#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 +#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 +#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 +#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 +#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 +#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_25G +#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 +#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 +#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_50G +#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 +#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_100G +#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 +#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 +#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 +#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_200G +#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62 +#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63 +#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */ +#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */ +#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */ +#endif + +/** + * Get master interface name from private structure. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] ifname + * Interface name output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]) +{ + DIR *dir; + struct dirent *dent; + unsigned int dev_type = 0; + unsigned int dev_port_prev = ~0u; + char match[IF_NAMESIZE] = ""; + + MLX5_ASSERT(ibdev_path); + { + MKSTR(path, "%s/device/net", ibdev_path); + + dir = opendir(path); + if (dir == NULL) { + rte_errno = errno; + return -rte_errno; + } + } + while ((dent = readdir(dir)) != NULL) { + char *name = dent->d_name; + FILE *file; + unsigned int dev_port; + int r; + + if ((name[0] == '.') && + ((name[1] == '\0') || + ((name[1] == '.') && (name[2] == '\0')))) + continue; + + MKSTR(path, "%s/device/net/%s/%s", + ibdev_path, name, + (dev_type ? "dev_id" : "dev_port")); + + file = fopen(path, "rb"); + if (file == NULL) { + if (errno != ENOENT) + continue; + /* + * Switch to dev_id when dev_port does not exist as + * is the case with Linux kernel versions < 3.15. 
+ */ +try_dev_id: + match[0] = '\0'; + if (dev_type) + break; + dev_type = 1; + dev_port_prev = ~0u; + rewinddir(dir); + continue; + } + r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); + fclose(file); + if (r != 1) + continue; + /* + * Switch to dev_id when dev_port returns the same value for + * all ports. May happen when using a MOFED release older than + * 3.0 with a Linux kernel >= 3.15. + */ + if (dev_port == dev_port_prev) + goto try_dev_id; + dev_port_prev = dev_port; + if (dev_port == 0) + strlcpy(match, name, sizeof(match)); + } + closedir(dir); + if (match[0] == '\0') { + rte_errno = ENOENT; + return -rte_errno; + } + strncpy(*ifname, match, sizeof(*ifname)); + return 0; +} + +/** + * Get interface name from private structure. + * + * This is a port representor-aware version of mlx5_get_master_ifname(). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] ifname + * Interface name output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int ifindex; + + MLX5_ASSERT(priv); + MLX5_ASSERT(priv->sh); + ifindex = mlx5_ifindex(dev); + if (!ifindex) { + if (!priv->representor) + return mlx5_get_master_ifname(priv->sh->ibdev_path, + ifname); + rte_errno = ENXIO; + return -rte_errno; + } + if (if_indextoname(ifindex, &(*ifname)[0])) + return 0; + rte_errno = errno; + return -rte_errno; +} + +/** + * Get the interface index from device name. + * + * @param[in] dev + * Pointer to Ethernet device. + * + * @return + * Nonzero interface index on success, zero otherwise and rte_errno is set. + */ +unsigned int +mlx5_ifindex(const struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int ifindex; + + MLX5_ASSERT(priv); + MLX5_ASSERT(priv->if_index); + ifindex = priv->if_index; + if (!ifindex) + rte_errno = ENXIO; + return ifindex; +} + +/** + * Perform ifreq ioctl() on associated Ethernet device. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param req + * Request number to pass to ioctl(). + * @param[out] ifr + * Interface request structure output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) +{ + int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + int ret = 0; + + if (sock == -1) { + rte_errno = errno; + return -rte_errno; + } + ret = mlx5_get_ifname(dev, &ifr->ifr_name); + if (ret) + goto error; + ret = ioctl(sock, req, ifr); + if (ret == -1) { + rte_errno = errno; + goto error; + } + close(sock); + return 0; +error: + close(sock); + return -rte_errno; +} + +/** + * Get device MTU. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] mtu + * MTU value output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) +{ + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); + + if (ret) + return ret; + *mtu = request.ifr_mtu; + return 0; +} + +/** + * Set device MTU. + * + * @param dev + * Pointer to Ethernet device. + * @param mtu + * MTU value to set. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct ifreq request = { .ifr_mtu = mtu, }; + + return mlx5_ifreq(dev, SIOCSIFMTU, &request); +} + +/** + * Set device flags. + * + * @param dev + * Pointer to Ethernet device. + * @param keep + * Bitmask for flags that must remain untouched. + * @param flags + * Bitmask for flags to modify. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) +{ + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); + + if (ret) + return ret; + request.ifr_flags &= keep; + request.ifr_flags |= flags & ~keep; + return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); +} + +/** + * DPDK callback for Ethernet device configuration. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_configure(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int rxqs_n = dev->data->nb_rx_queues; + unsigned int txqs_n = dev->data->nb_tx_queues; + const uint8_t use_app_rss_key = + !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; + int ret = 0; + + if (use_app_rss_key && + (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != + MLX5_RSS_HASH_KEY_LEN)) { + DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long", + dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN)); + rte_errno = EINVAL; + return -rte_errno; + } + priv->rss_conf.rss_key = + rte_realloc(priv->rss_conf.rss_key, + MLX5_RSS_HASH_KEY_LEN, 0); + if (!priv->rss_conf.rss_key) { + DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", + dev->data->port_id, rxqs_n); + rte_errno = ENOMEM; + return -rte_errno; + } + + if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) + dev->data->dev_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; + + memcpy(priv->rss_conf.rss_key, + use_app_rss_key ? + dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : + rss_hash_default_key, + MLX5_RSS_HASH_KEY_LEN); + priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN; + priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; + priv->rxqs = (void *)dev->data->rx_queues; + priv->txqs = (void *)dev->data->tx_queues; + if (txqs_n != priv->txqs_n) { + DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", + dev->data->port_id, priv->txqs_n, txqs_n); + priv->txqs_n = txqs_n; + } + if (rxqs_n > priv->config.ind_table_max_size) { + DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", + dev->data->port_id, rxqs_n); + rte_errno = EINVAL; + return -rte_errno; + } + if (rxqs_n != priv->rxqs_n) { + DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", + dev->data->port_id, priv->rxqs_n, rxqs_n); + priv->rxqs_n = rxqs_n; + } + priv->skip_default_rss_reta = 0; + ret = mlx5_proc_priv_init(dev); + if (ret) + return ret; + return 0; +} + +/** + * Configure default RSS reta. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
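 *
 * Worked example (assuming ind_table_max_size is 512): with 6 standard Rx
 * queues, 6 is not a power of two, so the full 512-entry table is used and
 * filled round-robin as q0 q1 q2 q3 q4 q5 q0 q1 ...; with 8 queues the
 * table holds exactly 8 entries, one per queue.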
+ */ +int +mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int rxqs_n = dev->data->nb_rx_queues; + unsigned int i; + unsigned int j; + unsigned int reta_idx_n; + int ret = 0; + unsigned int *rss_queue_arr = NULL; + unsigned int rss_queue_n = 0; + + if (priv->skip_default_rss_reta) + return ret; + rss_queue_arr = rte_malloc("", rxqs_n * sizeof(unsigned int), 0); + if (!rss_queue_arr) { + DRV_LOG(ERR, "port %u cannot allocate RSS queue list (%u)", + dev->data->port_id, rxqs_n); + rte_errno = ENOMEM; + return -rte_errno; + } + for (i = 0, j = 0; i < rxqs_n; i++) { + struct mlx5_rxq_data *rxq_data; + struct mlx5_rxq_ctrl *rxq_ctrl; + + rxq_data = (*priv->rxqs)[i]; + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (rxq_ctrl && rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) + rss_queue_arr[j++] = i; + } + rss_queue_n = j; + if (rss_queue_n > priv->config.ind_table_max_size) { + DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", + dev->data->port_id, rss_queue_n); + rte_errno = EINVAL; + rte_free(rss_queue_arr); + return -rte_errno; + } + DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", + dev->data->port_id, priv->rxqs_n, rxqs_n); + priv->rxqs_n = rxqs_n; + /* + * If the requested number of RX queues is not a power of two, + * use the maximum indirection table size for better balancing. + * The result is always rounded to the next power of two. + */ + reta_idx_n = (1 << log2above((rss_queue_n & (rss_queue_n - 1)) ? + priv->config.ind_table_max_size : + rss_queue_n)); + ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); + if (ret) { + rte_free(rss_queue_arr); + return ret; + } + /* + * When the number of RX queues is not a power of two, + * the remaining table entries are padded with reused WQs + * and hashes are not spread uniformly. + */ + for (i = 0, j = 0; (i != reta_idx_n); ++i) { + (*priv->reta_idx)[i] = rss_queue_arr[j]; + if (++j == rss_queue_n) + j = 0; + } + rte_free(rss_queue_arr); + return ret; +} + +/** + * Sets default tuning parameters. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] info + * Info structure output buffer. + */ +static void +mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + /* Minimum CPU utilization. */ + info->default_rxportconf.ring_size = 256; + info->default_txportconf.ring_size = 256; + info->default_rxportconf.burst_size = MLX5_RX_DEFAULT_BURST; + info->default_txportconf.burst_size = MLX5_TX_DEFAULT_BURST; + if ((priv->link_speed_capa & ETH_LINK_SPEED_200G) | + (priv->link_speed_capa & ETH_LINK_SPEED_100G)) { + info->default_rxportconf.nb_queues = 16; + info->default_txportconf.nb_queues = 16; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 2048; + info->default_txportconf.ring_size = 2048; + } + } else { + info->default_rxportconf.nb_queues = 8; + info->default_txportconf.nb_queues = 8; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 4096; + info->default_txportconf.ring_size = 4096; + } + } +} + +/** + * Sets tx mbuf limiting parameters. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] info + * Info structure output buffer. 
+ */ +static void +mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + unsigned int inlen; + uint16_t nb_max; + + inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ? + MLX5_SEND_DEF_INLINE_LEN : + (unsigned int)config->txq_inline_max; + MLX5_ASSERT(config->txq_inline_min >= 0); + inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min); + inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE * 2); + nb_max = (MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE - + inlen) / MLX5_WSEG_SIZE; + info->tx_desc_lim.nb_seg_max = nb_max; + info->tx_desc_lim.nb_mtu_seg_max = nb_max; +} + +/** + * DPDK callback to get information about the device. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] info + * Info structure output buffer. + */ +int +mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + unsigned int max; + + /* FIXME: we should ask the device for these values. */ + info->min_rx_bufsize = 32; + info->max_rx_pktlen = 65536; + info->max_lro_pkt_size = MLX5_MAX_LRO_SIZE; + /* + * Since we need one CQ per QP, the limit is the minimum number + * between the two values. + */ + max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq, + priv->sh->device_attr.orig_attr.max_qp); + /* max_rx_queues is uint16_t. */ + max = RTE_MIN(max, (unsigned int)UINT16_MAX); + info->max_rx_queues = max; + info->max_tx_queues = max; + info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; + info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); + info->rx_offload_capa = (mlx5_get_rx_port_offloads() | + info->rx_queue_offload_capa); + info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); + info->if_index = mlx5_ifindex(dev); + info->reta_size = priv->reta_idx_n ? + priv->reta_idx_n : config->ind_table_max_size; + info->hash_key_size = MLX5_RSS_HASH_KEY_LEN; + info->speed_capa = priv->link_speed_capa; + info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; + mlx5_set_default_params(dev, info); + mlx5_set_txlimit_params(dev, info); + info->switch_info.name = dev->data->name; + info->switch_info.domain_id = priv->domain_id; + info->switch_info.port_id = priv->representor_id; + if (priv->representor) { + uint16_t port_id; + + if (priv->pf_bond >= 0) { + /* + * Switch port ID is opaque value with driver defined + * format. Push the PF index in bonding configurations + * in upper four bits of port ID. If we get too many + * representors (more than 4K) or PFs (more than 15) + * this approach must be reconsidered. + */ + if ((info->switch_info.port_id >> + MLX5_PORT_ID_BONDING_PF_SHIFT) || + priv->pf_bond > MLX5_PORT_ID_BONDING_PF_MASK) { + DRV_LOG(ERR, "can't update switch port ID" + " for bonding device"); + MLX5_ASSERT(false); + return -ENODEV; + } + info->switch_info.port_id |= + priv->pf_bond << MLX5_PORT_ID_BONDING_PF_SHIFT; + } + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (!opriv || + opriv->representor || + opriv->sh != priv->sh || + opriv->domain_id != priv->domain_id) + continue; + /* + * Override switch name with that of the master + * device. 
+ */ + info->switch_info.name = opriv->dev_data->name; + break; + } + } + return 0; +} + +/** + * Get device current raw clock counter + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] time + * Current raw clock counter of the device. + * + * @return + * 0 if the clock has correctly been read + * The value of errno in case of error + */ +int +mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_values_ex values; + int err = 0; + + values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK; + err = mlx5_glue->query_rt_values_ex(ctx, &values); + if (err != 0) { + DRV_LOG(WARNING, "Could not query the clock !"); + return err; + } + *clock = values.raw_clock.tv_nsec; + return 0; +} + +/** + * Get firmware version of a device. + * + * @param dev + * Ethernet device port. + * @param fw_ver + * String output allocated by caller. + * @param fw_size + * Size of the output string, including terminating null byte. + * + * @return + * 0 on success, or the size of the non truncated string if too big. + */ +int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr; + size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1; + + if (fw_size < size) + return size; + if (fw_ver != NULL) + strlcpy(fw_ver, attr->fw_ver, fw_size); + return 0; +} + +/** + * Get supported packet types. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * A pointer to the supported Packet types array. + */ +const uint32_t * +mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) +{ + static const uint32_t ptypes[] = { + /* refers to rxq_cq_to_pkt_type() */ + RTE_PTYPE_L2_ETHER, + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_L4_NONFRAG, + RTE_PTYPE_L4_FRAG, + RTE_PTYPE_L4_TCP, + RTE_PTYPE_L4_UDP, + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_INNER_L4_NONFRAG, + RTE_PTYPE_INNER_L4_FRAG, + RTE_PTYPE_INNER_L4_TCP, + RTE_PTYPE_INNER_L4_UDP, + RTE_PTYPE_UNKNOWN + }; + + if (dev->rx_pkt_burst == mlx5_rx_burst || + dev->rx_pkt_burst == mlx5_rx_burst_mprq || + dev->rx_pkt_burst == mlx5_rx_burst_vec) + return ptypes; + return NULL; +} + +/** + * Retrieve the master device for representor in the same switch domain. + * + * @param dev + * Pointer to representor Ethernet device structure. + * + * @return + * Master device structure on success, NULL otherwise. + */ + +static struct rte_eth_dev * +mlx5_find_master_dev(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv; + uint16_t port_id; + uint16_t domain_id; + + priv = dev->data->dev_private; + domain_id = priv->domain_id; + MLX5_ASSERT(priv->representor); + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + if (opriv && + opriv->master && + opriv->domain_id == domain_id && + opriv->sh == priv->sh) + return &rte_eth_devices[port_id]; + } + return NULL; +} + +/** + * DPDK callback to retrieve physical link information. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, + struct rte_eth_link *link) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ethtool_cmd edata = { + .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ + }; + struct ifreq ifr; + struct rte_eth_link dev_link; + int link_speed = 0; + int ret; + + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + dev_link = (struct rte_eth_link) { + .link_status = ((ifr.ifr_flags & IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)), + }; + ifr = (struct ifreq) { + .ifr_data = (void *)&edata, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + if (ret == -ENOTSUP && priv->representor) { + struct rte_eth_dev *master; + + /* + * For representors we can try to inherit link + * settings from the master device. Actually + * link settings do not make a lot of sense + * for representors due to missing physical + * link. The old kernel drivers supported + * emulated settings query for representors, + * the new ones do not, so we have to add + * this code for compatibility issues. + */ + master = mlx5_find_master_dev(dev); + if (master) { + ifr = (struct ifreq) { + .ifr_data = (void *)&edata, + }; + ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); + } + } + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL," + " ETHTOOL_GSET) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + } + link_speed = ethtool_cmd_speed(&edata); + if (link_speed == -1) + dev_link.link_speed = ETH_SPEED_NUM_NONE; + else + dev_link.link_speed = link_speed; + priv->link_speed_capa = 0; + if (edata.supported & SUPPORTED_Autoneg) + priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; + if (edata.supported & (SUPPORTED_1000baseT_Full | + SUPPORTED_1000baseKX_Full)) + priv->link_speed_capa |= ETH_LINK_SPEED_1G; + if (edata.supported & SUPPORTED_10000baseKR_Full) + priv->link_speed_capa |= ETH_LINK_SPEED_10G; + if (edata.supported & (SUPPORTED_40000baseKR4_Full | + SUPPORTED_40000baseCR4_Full | + SUPPORTED_40000baseSR4_Full | + SUPPORTED_40000baseLR4_Full)) + priv->link_speed_capa |= ETH_LINK_SPEED_40G; + dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? + ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); + dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & + ETH_LINK_SPEED_FIXED); + if (((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status))) { + rte_errno = EAGAIN; + return -rte_errno; + } + *link = dev_link; + return 0; +} + +/** + * Retrieve physical link information (unlocked version using new ioctl). + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
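+ *
+ * A minimal sketch of the two-step ETHTOOL_GLINKSETTINGS negotiation this
+ * function mirrors, assuming an already opened control socket @p fd and an
+ * @p ifr whose ifr_name has been filled in; the helper name is
+ * illustrative only:
+ *
+ * @code
+ * static int
+ * example_glinksettings(int fd, struct ifreq *ifr, uint32_t *speed)
+ * {
+ *         struct ethtool_link_settings probe = {
+ *                 .cmd = ETHTOOL_GLINKSETTINGS,
+ *         };
+ *
+ *         ifr->ifr_data = (void *)&probe;
+ *         if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+ *                 return -errno;
+ *         // The kernel answers the zero-sized request with the required
+ *         // number of 32-bit link mode mask words as a negative value.
+ *         probe.link_mode_masks_nwords = -probe.link_mode_masks_nwords;
+ *         alignas(struct ethtool_link_settings)
+ *         uint8_t buf[sizeof(probe) + 3 * sizeof(uint32_t) *
+ *                     probe.link_mode_masks_nwords];
+ *         struct ethtool_link_settings *req = (void *)buf;
+ *
+ *         *req = probe;
+ *         ifr->ifr_data = (void *)req;
+ *         if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+ *                 return -errno;
+ *         // req->link_mode_masks[] now holds the supported, advertising
+ *         // and peer-advertising masks; req->speed the negotiated speed.
+ *         *speed = req->speed;
+ *         return 0;
+ * }
+ * @endcode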
+ */ +static int +mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, + struct rte_eth_link *link) + +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; + struct ifreq ifr; + struct rte_eth_link dev_link; + struct rte_eth_dev *master = NULL; + uint64_t sc; + int ret; + + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + dev_link = (struct rte_eth_link) { + .link_status = ((ifr.ifr_flags & IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)), + }; + ifr = (struct ifreq) { + .ifr_data = (void *)&gcmd, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + if (ret == -ENOTSUP && priv->representor) { + /* + * For representors we can try to inherit link + * settings from the master device. Actually + * link settings do not make a lot of sense + * for representors due to missing physical + * link. The old kernel drivers supported + * emulated settings query for representors, + * the new ones do not, so we have to add + * this code for compatibility issues. + */ + master = mlx5_find_master_dev(dev); + if (master) { + ifr = (struct ifreq) { + .ifr_data = (void *)&gcmd, + }; + ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); + } + } + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL," + " ETHTOOL_GLINKSETTINGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + + } + gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; + + alignas(struct ethtool_link_settings) + uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + + sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; + struct ethtool_link_settings *ecmd = (void *)data; + + *ecmd = gcmd; + ifr.ifr_data = (void *)ecmd; + ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL," + "ETHTOOL_GLINKSETTINGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? 
ETH_SPEED_NUM_NONE : + ecmd->speed; + sc = ecmd->link_mode_masks[0] | + ((uint64_t)ecmd->link_mode_masks[1] << 32); + priv->link_speed_capa = 0; + if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) + priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_1G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_10G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_20G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_40G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_56G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_25G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_50G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_100G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_200G; + + sc = ecmd->link_mode_masks[2] | + ((uint64_t)ecmd->link_mode_masks[3] << 32); + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) | + MLX5_BITSHIFT( + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_200G; + dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? + ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); + dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & + ETH_LINK_SPEED_FIXED); + if (((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status))) { + rte_errno = EAGAIN; + return -rte_errno; + } + *link = dev_link; + return 0; +} + +/** + * DPDK callback to retrieve physical link information. + * + * @param dev + * Pointer to Ethernet device structure. + * @param wait_to_complete + * Wait for request completion. + * + * @return + * 0 if link status was not updated, positive if it was, a negative errno + * value otherwise and rte_errno is set. 
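+ *
+ * A minimal application-side sketch, assuming @p port_id refers to a
+ * started mlx5 port; applications normally reach this callback through the
+ * ethdev API rather than calling it directly:
+ *
+ * @code
+ * struct rte_eth_link link;
+ *
+ * memset(&link, 0, sizeof(link));
+ * // Single query, no retries (wait_to_complete == 0).
+ * rte_eth_link_get_nowait(port_id, &link);
+ * if (link.link_status == ETH_LINK_UP)
+ *         printf("port %u: %u Mbps\n", (unsigned int)port_id,
+ *                link.link_speed);
+ * else
+ *         printf("port %u: link down\n", (unsigned int)port_id);
+ * @endcode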
+ */ +int +mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) +{ + int ret; + struct rte_eth_link dev_link; + time_t start_time = time(NULL); + int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT; + + do { + ret = mlx5_link_update_unlocked_gs(dev, &dev_link); + if (ret == -ENOTSUP) + ret = mlx5_link_update_unlocked_gset(dev, &dev_link); + if (ret == 0) + break; + /* Handle wait to complete situation. */ + if ((wait_to_complete || retry) && ret == -EAGAIN) { + if (abs((int)difftime(time(NULL), start_time)) < + MLX5_LINK_STATUS_TIMEOUT) { + usleep(0); + continue; + } else { + rte_errno = EBUSY; + return -rte_errno; + } + } else if (ret < 0) { + return ret; + } + } while (wait_to_complete || retry-- > 0); + ret = !!memcmp(&dev->data->dev_link, &dev_link, + sizeof(struct rte_eth_link)); + dev->data->dev_link = dev_link; + return ret; +} + +/** + * DPDK callback to change the MTU. + * + * @param dev + * Pointer to Ethernet device structure. + * @param in_mtu + * New MTU. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint16_t kern_mtu = 0; + int ret; + + ret = mlx5_get_mtu(dev, &kern_mtu); + if (ret) + return ret; + /* Set kernel interface MTU first. */ + ret = mlx5_set_mtu(dev, mtu); + if (ret) + return ret; + ret = mlx5_get_mtu(dev, &kern_mtu); + if (ret) + return ret; + if (kern_mtu == mtu) { + priv->mtu = mtu; + DRV_LOG(DEBUG, "port %u adapter MTU set to %u", + dev->data->port_id, mtu); + return 0; + } + rte_errno = EAGAIN; + return -rte_errno; +} + +/** + * DPDK callback to get flow control status. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] fc_conf + * Flow control output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_GPAUSEPARAM + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" + " %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + fc_conf->autoneg = ethpause.autoneg; + if (ethpause.rx_pause && ethpause.tx_pause) + fc_conf->mode = RTE_FC_FULL; + else if (ethpause.rx_pause) + fc_conf->mode = RTE_FC_RX_PAUSE; + else if (ethpause.tx_pause) + fc_conf->mode = RTE_FC_TX_PAUSE; + else + fc_conf->mode = RTE_FC_NONE; + return 0; +} + +/** + * DPDK callback to modify flow control parameters. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in] fc_conf + * Flow control parameters. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
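+ *
+ * A minimal application-side sketch, assuming @p port_id is a valid mlx5
+ * port and full (Rx and Tx) pause is desired:
+ *
+ * @code
+ * struct rte_eth_fc_conf fc;
+ *
+ * memset(&fc, 0, sizeof(fc));
+ * fc.mode = RTE_FC_FULL;  // mapped below to rx_pause = tx_pause = 1
+ * fc.autoneg = 1;
+ * if (rte_eth_dev_flow_ctrl_set(port_id, &fc) != 0)
+ *         rte_exit(EXIT_FAILURE, "cannot set flow control on port %u\n",
+ *                  (unsigned int)port_id);
+ * @endcode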
+ */ +int +mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_SPAUSEPARAM + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + ethpause.autoneg = fc_conf->autoneg; + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_RX_PAUSE)) + ethpause.rx_pause = 1; + else + ethpause.rx_pause = 0; + + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_TX_PAUSE)) + ethpause.tx_pause = 1; + else + ethpause.tx_pause = 0; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + return 0; +} + +/** + * Handle asynchronous removal event for entire multiport device. + * + * @param sh + * Infiniband device shared context. + */ +static void +mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh) +{ + uint32_t i; + + for (i = 0; i < sh->max_port; ++i) { + struct rte_eth_dev *dev; + + if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) { + /* + * Or not existing port either no + * handler installed for this port. + */ + continue; + } + dev = &rte_eth_devices[sh->port[i].ih_port_id]; + MLX5_ASSERT(dev); + if (dev->data->dev_conf.intr_conf.rmv) + _rte_eth_dev_callback_process + (dev, RTE_ETH_EVENT_INTR_RMV, NULL); + } +} + +/** + * Handle shared asynchronous events the NIC (removal event + * and link status change). Supports multiport IB device. + * + * @param cb_arg + * Callback argument. + */ +void +mlx5_dev_interrupt_handler(void *cb_arg) +{ + struct mlx5_ibv_shared *sh = cb_arg; + struct ibv_async_event event; + + /* Read all message from the IB device and acknowledge them. */ + for (;;) { + struct rte_eth_dev *dev; + uint32_t tmp; + + if (mlx5_glue->get_async_event(sh->ctx, &event)) + break; + /* Retrieve and check IB port index. */ + tmp = (uint32_t)event.element.port_num; + if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) { + /* + * The DEVICE_FATAL event is called once for + * entire device without port specifying. + * We should notify all existing ports. + */ + mlx5_glue->ack_async_event(&event); + mlx5_dev_interrupt_device_fatal(sh); + continue; + } + MLX5_ASSERT(tmp && (tmp <= sh->max_port)); + if (!tmp) { + /* Unsupported devive level event. */ + mlx5_glue->ack_async_event(&event); + DRV_LOG(DEBUG, + "unsupported common event (type %d)", + event.event_type); + continue; + } + if (tmp > sh->max_port) { + /* Invalid IB port index. */ + mlx5_glue->ack_async_event(&event); + DRV_LOG(DEBUG, + "cannot handle an event (type %d)" + "due to invalid IB port index (%u)", + event.event_type, tmp); + continue; + } + if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) { + /* No handler installed. */ + mlx5_glue->ack_async_event(&event); + DRV_LOG(DEBUG, + "cannot handle an event (type %d)" + "due to no handler installed for port %u", + event.event_type, tmp); + continue; + } + /* Retrieve ethernet device descriptor. 
*/ + tmp = sh->port[tmp - 1].ih_port_id; + dev = &rte_eth_devices[tmp]; + MLX5_ASSERT(dev); + if ((event.event_type == IBV_EVENT_PORT_ACTIVE || + event.event_type == IBV_EVENT_PORT_ERR) && + dev->data->dev_conf.intr_conf.lsc) { + mlx5_glue->ack_async_event(&event); + if (mlx5_link_update(dev, 0) == -EAGAIN) { + usleep(0); + continue; + } + _rte_eth_dev_callback_process + (dev, RTE_ETH_EVENT_INTR_LSC, NULL); + continue; + } + DRV_LOG(DEBUG, + "port %u cannot handle an unknown event (type %d)", + dev->data->port_id, event.event_type); + mlx5_glue->ack_async_event(&event); + } +} + +/* + * Unregister callback handler safely. The handler may be active + * while we are trying to unregister it, in this case code -EAGAIN + * is returned by rte_intr_callback_unregister(). This routine checks + * the return code and tries to unregister handler again. + * + * @param handle + * interrupt handle + * @param cb_fn + * pointer to callback routine + * @cb_arg + * opaque callback parameter + */ +void +mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + /* + * Try to reduce timeout management overhead by not calling + * the timer related routines on the first iteration. If the + * unregistering succeeds on first call there will be no + * timer calls at all. + */ + uint64_t twait = 0; + uint64_t start = 0; + + do { + int ret; + + ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg); + if (ret >= 0) + return; + if (ret != -EAGAIN) { + DRV_LOG(INFO, "failed to unregister interrupt" + " handler (error: %d)", ret); + MLX5_ASSERT(false); + return; + } + if (twait) { + struct timespec onems; + + /* Wait one millisecond and try again. */ + onems.tv_sec = 0; + onems.tv_nsec = NS_PER_S / MS_PER_S; + nanosleep(&onems, 0); + /* Check whether one second elapsed. */ + if ((rte_get_timer_cycles() - start) <= twait) + continue; + } else { + /* + * We get the amount of timer ticks for one second. + * If this amount elapsed it means we spent one + * second in waiting. This branch is executed once + * on first iteration. + */ + twait = rte_get_timer_hz(); + MLX5_ASSERT(twait); + } + /* + * Timeout elapsed, show message (once a second) and retry. + * We have no other acceptable option here, if we ignore + * the unregistering return code the handler will not + * be unregistered, fd will be closed and we may get the + * crush. Hanging and messaging in the loop seems not to be + * the worst choice. + */ + DRV_LOG(INFO, "Retrying to unregister interrupt handler"); + start = rte_get_timer_cycles(); + } while (true); +} + +/** + * Handle DEVX interrupts from the NIC. + * This function is probably called from the DPDK host thread. + * + * @param cb_arg + * Callback argument. + */ +void +mlx5_dev_interrupt_handler_devx(void *cb_arg) +{ +#ifndef HAVE_IBV_DEVX_ASYNC + (void)cb_arg; + return; +#else + struct mlx5_ibv_shared *sh = cb_arg; + union { + struct mlx5dv_devx_async_cmd_hdr cmd_resp; + uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) + + MLX5_ST_SZ_BYTES(traffic_counter) + + sizeof(struct mlx5dv_devx_async_cmd_hdr)]; + } out; + uint8_t *buf = out.buf + sizeof(out.cmd_resp); + + while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp, + &out.cmd_resp, + sizeof(out.buf))) + mlx5_flow_async_pool_query_handle + (sh, (uint64_t)out.cmd_resp.wr_id, + mlx5_devx_get_out_command_status(buf)); +#endif /* HAVE_IBV_DEVX_ASYNC */ +} + +/** + * Uninstall shared asynchronous device events handler. 
+ * This function is implemented to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS) + goto exit; + MLX5_ASSERT(sh->port[priv->ibv_port - 1].ih_port_id == + (uint32_t)dev->data->port_id); + MLX5_ASSERT(sh->intr_cnt); + sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS; + if (!sh->intr_cnt || --sh->intr_cnt) + goto exit; + mlx5_intr_callback_unregister(&sh->intr_handle, + mlx5_dev_interrupt_handler, sh); + sh->intr_handle.fd = 0; + sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Uninstall devx shared asynchronous device events handler. + * This function is implemeted to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_devx_uninstall(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].devx_ih_port_id >= RTE_MAX_ETHPORTS) + goto exit; + MLX5_ASSERT(sh->port[priv->ibv_port - 1].devx_ih_port_id == + (uint32_t)dev->data->port_id); + sh->port[priv->ibv_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS; + if (!sh->devx_intr_cnt || --sh->devx_intr_cnt) + goto exit; + if (sh->intr_handle_devx.fd) { + rte_intr_callback_unregister(&sh->intr_handle_devx, + mlx5_dev_interrupt_handler_devx, + sh); + sh->intr_handle_devx.fd = 0; + sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN; + } + if (sh->devx_comp) { + mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp); + sh->devx_comp = NULL; + } +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Install shared asynchronous device events handler. + * This function is implemented to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_install(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + int ret; + int flags; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) { + /* The handler is already installed for this port. */ + MLX5_ASSERT(sh->intr_cnt); + goto exit; + } + if (sh->intr_cnt) { + sh->port[priv->ibv_port - 1].ih_port_id = + (uint32_t)dev->data->port_id; + sh->intr_cnt++; + goto exit; + } + /* No shared handler installed. 
*/ + MLX5_ASSERT(sh->ctx->async_fd > 0); + flags = fcntl(sh->ctx->async_fd, F_GETFL); + ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + DRV_LOG(INFO, "failed to change file descriptor async event" + " queue"); + /* Indicate there will be no interrupts. */ + dev->data->dev_conf.intr_conf.lsc = 0; + dev->data->dev_conf.intr_conf.rmv = 0; + } else { + sh->intr_handle.fd = sh->ctx->async_fd; + sh->intr_handle.type = RTE_INTR_HANDLE_EXT; + rte_intr_callback_register(&sh->intr_handle, + mlx5_dev_interrupt_handler, sh); + sh->intr_cnt++; + sh->port[priv->ibv_port - 1].ih_port_id = + (uint32_t)dev->data->port_id; + } +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Install devx shared asyncronous device events handler. + * This function is implemeted to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_devx_install(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].devx_ih_port_id < RTE_MAX_ETHPORTS) { + /* The handler is already installed for this port. */ + MLX5_ASSERT(sh->devx_intr_cnt); + goto exit; + } + if (sh->devx_intr_cnt) { + sh->devx_intr_cnt++; + sh->port[priv->ibv_port - 1].devx_ih_port_id = + (uint32_t)dev->data->port_id; + goto exit; + } + if (priv->config.devx) { +#ifndef HAVE_IBV_DEVX_ASYNC + goto exit; +#else + sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx); + if (sh->devx_comp) { + int flags = fcntl(sh->devx_comp->fd, F_GETFL); + int ret = fcntl(sh->devx_comp->fd, F_SETFL, + flags | O_NONBLOCK); + + if (ret) { + DRV_LOG(INFO, "failed to change file descriptor" + " devx async event queue"); + } else { + sh->intr_handle_devx.fd = sh->devx_comp->fd; + sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT; + rte_intr_callback_register + (&sh->intr_handle_devx, + mlx5_dev_interrupt_handler_devx, sh); + sh->devx_intr_cnt++; + sh->port[priv->ibv_port - 1].devx_ih_port_id = + (uint32_t)dev->data->port_id; + } + } +#endif /* HAVE_IBV_DEVX_ASYNC */ + } +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Uninstall interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_uninstall(dev); +} + +/** + * Install interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_install(dev); +} + +/** + * Devx uninstall interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_devx_uninstall(dev); +} + +/** + * Devx install interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_devx_install(dev); +} + +/** + * DPDK callback to bring the link DOWN. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
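+ *
+ * A worked example of the keep/flags arithmetic, assuming mlx5_set_flags()
+ * (defined elsewhere in the driver) preserves the interface flag bits
+ * selected by its second argument and overwrites the remaining bits with
+ * its third argument; @p cur stands for the current ifr_flags value:
+ *
+ * @code
+ * // Link down: mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP)
+ * unsigned int down = (cur & ~IFF_UP) | (~IFF_UP & IFF_UP); // cur & ~IFF_UP
+ * // Link up: mlx5_set_flags(dev, ~IFF_UP, IFF_UP)
+ * unsigned int up = (cur & ~IFF_UP) | (IFF_UP & IFF_UP);    // cur | IFF_UP
+ * @endcode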
+ */ +int +mlx5_set_link_down(struct rte_eth_dev *dev) +{ + return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); +} + +/** + * DPDK callback to bring the link UP. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_set_link_up(struct rte_eth_dev *dev) +{ + return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); +} + +/** + * Configure the RX function to use. + * + * @param dev + * Pointer to private data structure. + * + * @return + * Pointer to selected Rx burst function. + */ +eth_rx_burst_t +mlx5_select_rx_function(struct rte_eth_dev *dev) +{ + eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; + + MLX5_ASSERT(dev != NULL); + if (mlx5_check_vec_rx_support(dev) > 0) { + rx_pkt_burst = mlx5_rx_burst_vec; + DRV_LOG(DEBUG, "port %u selected Rx vectorized function", + dev->data->port_id); + } else if (mlx5_mprq_enabled(dev)) { + rx_pkt_burst = mlx5_rx_burst_mprq; + } + return rx_pkt_burst; +} + +/** + * Check if mlx5 device was removed. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 1 when device is removed, otherwise 0. + */ +int +mlx5_is_removed(struct rte_eth_dev *dev) +{ + struct ibv_device_attr device_attr; + struct mlx5_priv *priv = dev->data->dev_private; + + if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO) + return 1; + return 0; +} + +/** + * Get the E-Switch parameters by port id. + * + * @param[in] port + * Device port id. + * @param[in] valid + * Device port id is valid, skip check. This flag is useful + * when trials are performed from probing and device is not + * flagged as valid yet (in attaching process). + * @param[out] es_domain_id + * E-Switch domain id. + * @param[out] es_port_id + * The port id of the port in the E-Switch. + * + * @return + * pointer to device private data structure containing data needed + * on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_priv * +mlx5_port_to_eswitch_info(uint16_t port, bool valid) +{ + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + + if (port >= RTE_MAX_ETHPORTS) { + rte_errno = EINVAL; + return NULL; + } + if (!valid && !rte_eth_dev_is_valid_port(port)) { + rte_errno = ENODEV; + return NULL; + } + dev = &rte_eth_devices[port]; + priv = dev->data->dev_private; + if (!(priv->representor || priv->master)) { + rte_errno = EINVAL; + return NULL; + } + return priv; +} + +/** + * Get the E-Switch parameters by device instance. + * + * @param[in] port + * Device port id. + * @param[out] es_domain_id + * E-Switch domain id. + * @param[out] es_port_id + * The port id of the port in the E-Switch. + * + * @return + * pointer to device private data structure containing data needed + * on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_priv * +mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv; + + priv = dev->data->dev_private; + if (!(priv->representor || priv->master)) { + rte_errno = EINVAL; + return NULL; + } + return priv; +} + +/** + * Get switch information associated with network interface. + * + * @param ifindex + * Network interface index. + * @param[out] info + * Switch information object, populated in case of success. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
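+ *
+ * A minimal sketch of the sysfs lookups performed below, assuming
+ * @p ifname has already been resolved with if_indextoname(); paths and
+ * buffer sizes are illustrative:
+ *
+ * @code
+ * char path[256];
+ * char port_name[IF_NAMESIZE];
+ * uint64_t switch_id = 0;
+ * FILE *f;
+ *
+ * snprintf(path, sizeof(path), "/sys/class/net/%s/phys_port_name", ifname);
+ * f = fopen(path, "rb");
+ * if (f != NULL) {
+ *         // e.g. "p0" (uplink) or "pf0vf1" (representor).
+ *         if (fscanf(f, "%15s", port_name) != 1)
+ *                 port_name[0] = '\0';
+ *         fclose(f);
+ * }
+ * snprintf(path, sizeof(path), "/sys/class/net/%s/phys_switch_id", ifname);
+ * f = fopen(path, "rb");
+ * if (f != NULL) {
+ *         // A readable switch id indicates an E-Switch (switchdev) setup.
+ *         if (fscanf(f, "%" SCNx64, &switch_id) != 1)
+ *                 switch_id = 0;
+ *         fclose(f);
+ * }
+ * @endcode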
+ */ +int +mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info) +{ + char ifname[IF_NAMESIZE]; + char port_name[IF_NAMESIZE]; + FILE *file; + struct mlx5_switch_info data = { + .master = 0, + .representor = 0, + .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, + .port_name = 0, + .switch_id = 0, + }; + DIR *dir; + bool port_switch_id_set = false; + bool device_dir = false; + char c; + int ret; + + if (!if_indextoname(ifindex, ifname)) { + rte_errno = errno; + return -rte_errno; + } + + MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name", + ifname); + MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id", + ifname); + MKSTR(pci_device, "/sys/class/net/%s/device", + ifname); + + file = fopen(phys_port_name, "rb"); + if (file != NULL) { + ret = fscanf(file, "%s", port_name); + fclose(file); + if (ret == 1) + mlx5_translate_port_name(port_name, &data); + } + file = fopen(phys_switch_id, "rb"); + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } + port_switch_id_set = + fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 && + c == '\n'; + fclose(file); + dir = opendir(pci_device); + if (dir != NULL) { + closedir(dir); + device_dir = true; + } + if (port_switch_id_set) { + /* We have some E-Switch configuration. */ + mlx5_sysfs_check_switch_info(device_dir, &data); + } + *info = data; + MLX5_ASSERT(!(data.master && data.representor)); + if (data.master && data.representor) { + DRV_LOG(ERR, "ifindex %u device is recognized as master" + " and as representor", ifindex); + rte_errno = ENODEV; + return -rte_errno; + } + return 0; +} + +/** + * Analyze gathered port parameters via sysfs to recognize master + * and representor devices for E-Switch configuration. + * + * @param[in] device_dir + * flag of presence of "device" directory under port device key. + * @param[inout] switch_info + * Port information, including port name as a number and port name + * type if recognized + * + * @return + * master and representor flags are set in switch_info according to + * recognized parameters (if any). + */ +void +mlx5_sysfs_check_switch_info(bool device_dir, + struct mlx5_switch_info *switch_info) +{ + switch (switch_info->name_type) { + case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: + /* + * Name is not recognized, assume the master, + * check the device directory presence. + */ + switch_info->master = device_dir; + break; + case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: + /* + * Name is not set, this assumes the legacy naming + * schema for master, just check if there is + * a device directory. + */ + switch_info->master = device_dir; + break; + case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: + /* New uplink naming schema recognized. */ + switch_info->master = 1; + break; + case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: + /* Legacy representors naming schema. */ + switch_info->representor = !device_dir; + break; + case MLX5_PHYS_PORT_NAME_TYPE_PFVF: + /* New representors naming schema. */ + switch_info->representor = 1; + break; + } +} + +/** + * DPDK callback to retrieve plug-in module EEPROM information (type and size). + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] modinfo + * Storage for plug-in module EEPROM information. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
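+ *
+ * A minimal application-side sketch, assuming @p port_id is a valid mlx5
+ * port with a plug-in module present and @p buf is large enough for the
+ * reported EEPROM length:
+ *
+ * @code
+ * struct rte_eth_dev_module_info modinfo;
+ * struct rte_dev_eeprom_info eeprom;
+ * uint8_t buf[1024];
+ *
+ * if (rte_eth_dev_get_module_info(port_id, &modinfo) == 0) {
+ *         memset(&eeprom, 0, sizeof(eeprom));
+ *         eeprom.offset = 0;
+ *         eeprom.length = RTE_MIN(modinfo.eeprom_len,
+ *                                 (uint32_t)sizeof(buf));
+ *         eeprom.data = buf;
+ *         if (rte_eth_dev_get_module_eeprom(port_id, &eeprom) == 0) {
+ *                 // buf[0..eeprom.length - 1] holds the raw module EEPROM.
+ *         }
+ * }
+ * @endcode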
+ */ +int +mlx5_get_module_info(struct rte_eth_dev *dev, + struct rte_eth_dev_module_info *modinfo) +{ + struct ethtool_modinfo info = { + .cmd = ETHTOOL_GMODULEINFO, + }; + struct ifreq ifr = (struct ifreq) { + .ifr_data = (void *)&info, + }; + int ret = 0; + + if (!dev || !modinfo) { + DRV_LOG(WARNING, "missing argument, cannot get module info"); + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + modinfo->type = info.type; + modinfo->eeprom_len = info.eeprom_len; + return ret; +} + +/** + * DPDK callback to retrieve plug-in module EEPROM data. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] info + * Storage for plug-in module EEPROM data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int mlx5_get_module_eeprom(struct rte_eth_dev *dev, + struct rte_dev_eeprom_info *info) +{ + struct ethtool_eeprom *eeprom; + struct ifreq ifr; + int ret = 0; + + if (!dev || !info) { + DRV_LOG(WARNING, "missing argument, cannot get module eeprom"); + rte_errno = EINVAL; + return -rte_errno; + } + eeprom = rte_calloc(__func__, 1, + (sizeof(struct ethtool_eeprom) + info->length), 0); + if (!eeprom) { + DRV_LOG(WARNING, "port %u cannot allocate memory for " + "eeprom data", dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; + } + eeprom->cmd = ETHTOOL_GMODULEEEPROM; + eeprom->offset = info->offset; + eeprom->len = info->length; + ifr = (struct ifreq) { + .ifr_data = (void *)eeprom, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) + DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", + dev->data->port_id, strerror(rte_errno)); + else + rte_memcpy(info->data, eeprom->data, info->length); + rte_free(eeprom); + return ret; +} + +/** + * DPDK callback to retrieve hairpin capabilities. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] cap + * Storage for hairpin capability data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int mlx5_hairpin_cap_get(struct rte_eth_dev *dev, + struct rte_eth_hairpin_cap *cap) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->sh->devx == 0) { + rte_errno = ENOTSUP; + return -rte_errno; + } + cap->max_nb_queues = UINT16_MAX; + cap->max_rx_2_tx = 1; + cap->max_tx_2_rx = 1; + cap->max_nb_desc = 8192; + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c new file mode 100644 index 000000000..ae478a510 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c @@ -0,0 +1,6204 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> +#include <stdbool.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_cycles.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_ip.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_flow.h" +#include "mlx5_rxtx.h" + +/* Dev ops structure defined in mlx5.c */ +extern const struct eth_dev_ops mlx5_dev_ops; +extern const struct eth_dev_ops mlx5_dev_ops_isolate; + +/** Device flow drivers. */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT +extern const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops; +#endif +extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops; + +const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops; + +const struct mlx5_flow_driver_ops *flow_drv_ops[] = { + [MLX5_FLOW_TYPE_MIN] = &mlx5_flow_null_drv_ops, +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + [MLX5_FLOW_TYPE_DV] = &mlx5_flow_dv_drv_ops, +#endif + [MLX5_FLOW_TYPE_VERBS] = &mlx5_flow_verbs_drv_ops, + [MLX5_FLOW_TYPE_MAX] = &mlx5_flow_null_drv_ops +}; + +enum mlx5_expansion { + MLX5_EXPANSION_ROOT, + MLX5_EXPANSION_ROOT_OUTER, + MLX5_EXPANSION_ROOT_ETH_VLAN, + MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN, + MLX5_EXPANSION_OUTER_ETH, + MLX5_EXPANSION_OUTER_ETH_VLAN, + MLX5_EXPANSION_OUTER_VLAN, + MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV4_UDP, + MLX5_EXPANSION_OUTER_IPV4_TCP, + MLX5_EXPANSION_OUTER_IPV6, + MLX5_EXPANSION_OUTER_IPV6_UDP, + MLX5_EXPANSION_OUTER_IPV6_TCP, + MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE, + MLX5_EXPANSION_GRE, + MLX5_EXPANSION_MPLS, + MLX5_EXPANSION_ETH, + MLX5_EXPANSION_ETH_VLAN, + MLX5_EXPANSION_VLAN, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV4_UDP, + MLX5_EXPANSION_IPV4_TCP, + MLX5_EXPANSION_IPV6, + MLX5_EXPANSION_IPV6_UDP, + MLX5_EXPANSION_IPV6_TCP, +}; + +/** Supported expansion of items. 
*/ +static const struct rte_flow_expand_node mlx5_support_expansion[] = { + [MLX5_EXPANSION_ROOT] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_ROOT_OUTER] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH, + MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_ROOT_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH_VLAN), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH_VLAN), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_OUTER_ETH] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6, + MLX5_EXPANSION_MPLS), + .type = RTE_FLOW_ITEM_TYPE_ETH, + .rss_types = 0, + }, + [MLX5_EXPANSION_OUTER_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_VLAN), + .type = RTE_FLOW_ITEM_TYPE_ETH, + .rss_types = 0, + }, + [MLX5_EXPANSION_OUTER_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VLAN, + }, + [MLX5_EXPANSION_OUTER_IPV4] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT + (MLX5_EXPANSION_OUTER_IPV4_UDP, + MLX5_EXPANSION_OUTER_IPV4_TCP, + MLX5_EXPANSION_GRE, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_OTHER, + }, + [MLX5_EXPANSION_OUTER_IPV4_UDP] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE), + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV4_UDP, + }, + [MLX5_EXPANSION_OUTER_IPV4_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV4_TCP, + }, + [MLX5_EXPANSION_OUTER_IPV6] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT + (MLX5_EXPANSION_OUTER_IPV6_UDP, + MLX5_EXPANSION_OUTER_IPV6_TCP, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | + ETH_RSS_NONFRAG_IPV6_OTHER, + }, + [MLX5_EXPANSION_OUTER_IPV6_UDP] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE), + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV6_UDP, + }, + [MLX5_EXPANSION_OUTER_IPV6_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV6_TCP, + }, + [MLX5_EXPANSION_VXLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VXLAN, + }, + [MLX5_EXPANSION_VXLAN_GPE] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VXLAN_GPE, + }, + [MLX5_EXPANSION_GRE] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4), + .type = RTE_FLOW_ITEM_TYPE_GRE, + }, + [MLX5_EXPANSION_MPLS] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_MPLS, + }, + [MLX5_EXPANSION_ETH] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_ETH, + }, + [MLX5_EXPANSION_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VLAN), + .type = RTE_FLOW_ITEM_TYPE_ETH, + }, + [MLX5_EXPANSION_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = 
RTE_FLOW_ITEM_TYPE_VLAN, + }, + [MLX5_EXPANSION_IPV4] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4_UDP, + MLX5_EXPANSION_IPV4_TCP), + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_OTHER, + }, + [MLX5_EXPANSION_IPV4_UDP] = { + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV4_UDP, + }, + [MLX5_EXPANSION_IPV4_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV4_TCP, + }, + [MLX5_EXPANSION_IPV6] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV6_UDP, + MLX5_EXPANSION_IPV6_TCP), + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | + ETH_RSS_NONFRAG_IPV6_OTHER, + }, + [MLX5_EXPANSION_IPV6_UDP] = { + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV6_UDP, + }, + [MLX5_EXPANSION_IPV6_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV6_TCP, + }, +}; + +static const struct rte_flow_ops mlx5_flow_ops = { + .validate = mlx5_flow_validate, + .create = mlx5_flow_create, + .destroy = mlx5_flow_destroy, + .flush = mlx5_flow_flush, + .isolate = mlx5_flow_isolate, + .query = mlx5_flow_query, + .dev_dump = mlx5_flow_dev_dump, + .get_aged_flows = mlx5_flow_get_aged_flows, +}; + +/* Convert FDIR request to Generic flow. */ +struct mlx5_fdir { + struct rte_flow_attr attr; + struct rte_flow_item items[4]; + struct rte_flow_item_eth l2; + struct rte_flow_item_eth l2_mask; + union { + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + } l3; + union { + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + } l3_mask; + union { + struct rte_flow_item_udp udp; + struct rte_flow_item_tcp tcp; + } l4; + union { + struct rte_flow_item_udp udp; + struct rte_flow_item_tcp tcp; + } l4_mask; + struct rte_flow_action actions[2]; + struct rte_flow_action_queue queue; +}; + +/* Map of Verbs to Flow priority with 8 Verbs priorities. */ +static const uint32_t priority_map_3[][MLX5_PRIORITY_MAP_MAX] = { + { 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 }, +}; + +/* Map of Verbs to Flow priority with 16 Verbs priorities. */ +static const uint32_t priority_map_5[][MLX5_PRIORITY_MAP_MAX] = { + { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 }, + { 9, 10, 11 }, { 12, 13, 14 }, +}; + +/* Tunnel information. */ +struct mlx5_flow_tunnel_info { + uint64_t tunnel; /**< Tunnel bit (see MLX5_FLOW_*). */ + uint32_t ptype; /**< Tunnel Ptype (see RTE_PTYPE_*). */ +}; + +static struct mlx5_flow_tunnel_info tunnels_info[] = { + { + .tunnel = MLX5_FLOW_LAYER_VXLAN, + .ptype = RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_GENEVE, + .ptype = RTE_PTYPE_TUNNEL_GENEVE | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_VXLAN_GPE, + .ptype = RTE_PTYPE_TUNNEL_VXLAN_GPE | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_GRE, + .ptype = RTE_PTYPE_TUNNEL_GRE, + }, + { + .tunnel = MLX5_FLOW_LAYER_MPLS | MLX5_FLOW_LAYER_OUTER_L4_UDP, + .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_UDP | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_MPLS, + .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_GRE, + }, + { + .tunnel = MLX5_FLOW_LAYER_NVGRE, + .ptype = RTE_PTYPE_TUNNEL_NVGRE, + }, + { + .tunnel = MLX5_FLOW_LAYER_IPIP, + .ptype = RTE_PTYPE_TUNNEL_IP, + }, + { + .tunnel = MLX5_FLOW_LAYER_IPV6_ENCAP, + .ptype = RTE_PTYPE_TUNNEL_IP, + }, + { + .tunnel = MLX5_FLOW_LAYER_GTP, + .ptype = RTE_PTYPE_TUNNEL_GTPU, + }, +}; + +/** + * Translate tag ID to register. + * + * @param[in] dev + * Pointer to the Ethernet device structure. 
+ * @param[in] feature + * The feature that request the register. + * @param[in] id + * The request register ID. + * @param[out] error + * Error description in case of any. + * + * @return + * The request register on success, a negative errno + * value otherwise and rte_errno is set. + */ +int +mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + enum mlx5_feature_name feature, + uint32_t id, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + enum modify_reg start_reg; + bool skip_mtr_reg = false; + + switch (feature) { + case MLX5_HAIRPIN_RX: + return REG_B; + case MLX5_HAIRPIN_TX: + return REG_A; + case MLX5_METADATA_RX: + switch (config->dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + return REG_B; + case MLX5_XMETA_MODE_META16: + return REG_C_0; + case MLX5_XMETA_MODE_META32: + return REG_C_1; + } + break; + case MLX5_METADATA_TX: + return REG_A; + case MLX5_METADATA_FDB: + switch (config->dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + return REG_NONE; + case MLX5_XMETA_MODE_META16: + return REG_C_0; + case MLX5_XMETA_MODE_META32: + return REG_C_1; + } + break; + case MLX5_FLOW_MARK: + switch (config->dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + return REG_NONE; + case MLX5_XMETA_MODE_META16: + return REG_C_1; + case MLX5_XMETA_MODE_META32: + return REG_C_0; + } + break; + case MLX5_MTR_SFX: + /* + * If meter color and flow match share one register, flow match + * should use the meter color register for match. + */ + if (priv->mtr_reg_share) + return priv->mtr_color_reg; + else + return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : + REG_C_3; + case MLX5_MTR_COLOR: + MLX5_ASSERT(priv->mtr_color_reg != REG_NONE); + return priv->mtr_color_reg; + case MLX5_COPY_MARK: + /* + * Metadata COPY_MARK register using is in meter suffix sub + * flow while with meter. It's safe to share the same register. + */ + return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : REG_C_3; + case MLX5_APP_TAG: + /* + * If meter is enable, it will engage the register for color + * match and flow match. If meter color match is not using the + * REG_C_2, need to skip the REG_C_x be used by meter color + * match. + * If meter is disable, free to use all available registers. + */ + start_reg = priv->mtr_color_reg != REG_C_2 ? REG_C_2 : + (priv->mtr_reg_share ? REG_C_3 : REG_C_4); + skip_mtr_reg = !!(priv->mtr_en && start_reg == REG_C_2); + if (id > (REG_C_7 - start_reg)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "invalid tag id"); + if (config->flow_mreg_c[id + start_reg - REG_C_0] == REG_NONE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "unsupported tag id"); + /* + * This case means meter is using the REG_C_x great than 2. + * Take care not to conflict with meter color REG_C_x. + * If the available index REG_C_y >= REG_C_x, skip the + * color register. 
+ */ + if (skip_mtr_reg && config->flow_mreg_c + [id + start_reg - REG_C_0] >= priv->mtr_color_reg) { + if (id >= (REG_C_7 - start_reg)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "invalid tag id"); + if (config->flow_mreg_c + [id + 1 + start_reg - REG_C_0] != REG_NONE) + return config->flow_mreg_c + [id + 1 + start_reg - REG_C_0]; + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "unsupported tag id"); + } + return config->flow_mreg_c[id + start_reg - REG_C_0]; + } + MLX5_ASSERT(false); + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "invalid feature name"); +} + +/** + * Check extensive flow metadata register support. + * + * @param dev + * Pointer to rte_eth_dev structure. + * + * @return + * True if device supports extensive flow metadata register, otherwise false. + */ +bool +mlx5_flow_ext_mreg_supported(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + + /* + * Having available reg_c can be regarded inclusively as supporting + * extensive flow metadata register, which could mean, + * - metadata register copy action by modify header. + * - 16 modify header actions is supported. + * - reg_c's are preserved across different domain (FDB and NIC) on + * packet loopback by flow lookup miss. + */ + return config->flow_mreg_c[2] != REG_NONE; +} + +/** + * Discover the maximum number of priority available. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * + * @return + * number of supported flow priority on success, a negative errno + * value otherwise and rte_errno is set. + */ +int +mlx5_flow_discover_priorities(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct { + struct ibv_flow_attr attr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_action_drop drop; + } flow_attr = { + .attr = { + .num_of_specs = 2, + .port = (uint8_t)priv->ibv_port, + }, + .eth = { + .type = IBV_FLOW_SPEC_ETH, + .size = sizeof(struct ibv_flow_spec_eth), + }, + .drop = { + .size = sizeof(struct ibv_flow_spec_action_drop), + .type = IBV_FLOW_SPEC_ACTION_DROP, + }, + }; + struct ibv_flow *flow; + struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev); + uint16_t vprio[] = { 8, 16 }; + int i; + int priority = 0; + + if (!drop) { + rte_errno = ENOTSUP; + return -rte_errno; + } + for (i = 0; i != RTE_DIM(vprio); i++) { + flow_attr.attr.priority = vprio[i] - 1; + flow = mlx5_glue->create_flow(drop->qp, &flow_attr.attr); + if (!flow) + break; + claim_zero(mlx5_glue->destroy_flow(flow)); + priority = vprio[i]; + } + mlx5_hrxq_drop_release(dev); + switch (priority) { + case 8: + priority = RTE_DIM(priority_map_3); + break; + case 16: + priority = RTE_DIM(priority_map_5); + break; + default: + rte_errno = ENOTSUP; + DRV_LOG(ERR, + "port %u verbs maximum priority: %d expected 8/16", + dev->data->port_id, priority); + return -rte_errno; + } + DRV_LOG(INFO, "port %u flow maximum priority: %d", + dev->data->port_id, priority); + return priority; +} + +/** + * Adjust flow priority based on the highest layer and the request priority. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] priority + * The rule base priority. + * @param[in] subpriority + * The priority based on the items. + * + * @return + * The new priority. 
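+ *
+ * A worked example, assuming the device exposes 8 Verbs priorities so that
+ * priv->config.flow_prio == RTE_DIM(priority_map_3):
+ *
+ * @code
+ * // priority_map_3 = { { 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 } }
+ * uint32_t res = mlx5_flow_adjust_priority(dev, 1, 2);
+ * // res == priority_map_3[1][2] == 4
+ * @endcode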
+ */ +uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority, + uint32_t subpriority) +{ + uint32_t res = 0; + struct mlx5_priv *priv = dev->data->dev_private; + + switch (priv->config.flow_prio) { + case RTE_DIM(priority_map_3): + res = priority_map_3[priority][subpriority]; + break; + case RTE_DIM(priority_map_5): + res = priority_map_5[priority][subpriority]; + break; + } + return res; +} + +/** + * Verify the @p item specifications (spec, last, mask) are compatible with the + * NIC capabilities. + * + * @param[in] item + * Item specification. + * @param[in] mask + * @p item->mask or flow default bit-masks. + * @param[in] nic_mask + * Bit-masks covering supported fields by the NIC to compare with user mask. + * @param[in] size + * Bit-masks size in bytes. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_item_acceptable(const struct rte_flow_item *item, + const uint8_t *mask, + const uint8_t *nic_mask, + unsigned int size, + struct rte_flow_error *error) +{ + unsigned int i; + + MLX5_ASSERT(nic_mask); + for (i = 0; i < size; ++i) + if ((nic_mask[i] | mask[i]) != nic_mask[i]) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "mask enables non supported" + " bits"); + if (!item->spec && (item->mask || item->last)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "mask/last without a spec is not" + " supported"); + if (item->spec && item->last) { + uint8_t spec[size]; + uint8_t last[size]; + unsigned int i; + int ret; + + for (i = 0; i < size; ++i) { + spec[i] = ((const uint8_t *)item->spec)[i] & mask[i]; + last[i] = ((const uint8_t *)item->last)[i] & mask[i]; + } + ret = memcmp(spec, last, size); + if (ret != 0) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "range is not valid"); + } + return 0; +} + +/** + * Adjust the hash fields according to the @p flow information. + * + * @param[in] dev_flow. + * Pointer to the mlx5_flow. + * @param[in] tunnel + * 1 when the hash field is for a tunnel item. + * @param[in] layer_types + * ETH_RSS_* types. + * @param[in] hash_fields + * Item hash fields. + * + * @return + * The hash fields that should be used. + */ +uint64_t +mlx5_flow_hashfields_adjust(struct mlx5_flow_rss_desc *rss_desc, + int tunnel __rte_unused, uint64_t layer_types, + uint64_t hash_fields) +{ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + int rss_request_inner = rss_desc->level >= 2; + + /* Check RSS hash level for tunnel. */ + if (tunnel && rss_request_inner) + hash_fields |= IBV_RX_HASH_INNER; + else if (tunnel || rss_request_inner) + return 0; +#endif + /* Check if requested layer matches RSS hash fields. */ + if (!(rss_desc->types & layer_types)) + return 0; + return hash_fields; +} + +/** + * Lookup and set the ptype in the data Rx part. A single Ptype can be used, + * if several tunnel rules are used on this queue, the tunnel ptype will be + * cleared. + * + * @param rxq_ctrl + * Rx queue to update. + */ +static void +flow_rxq_tunnel_ptype_update(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + unsigned int i; + uint32_t tunnel_ptype = 0; + + /* Look up for the ptype to use. 
*/ + for (i = 0; i != MLX5_FLOW_TUNNEL; ++i) { + if (!rxq_ctrl->flow_tunnels_n[i]) + continue; + if (!tunnel_ptype) { + tunnel_ptype = tunnels_info[i].ptype; + } else { + tunnel_ptype = 0; + break; + } + } + rxq_ctrl->rxq.tunnel = tunnel_ptype; +} + +/** + * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) according to the devive + * flow. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] dev_handle + * Pointer to device flow handle structure. + */ +static void +flow_drv_rxq_flags_set(struct rte_eth_dev *dev, + struct mlx5_flow_handle *dev_handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int mark = dev_handle->mark; + const int tunnel = !!(dev_handle->layers & MLX5_FLOW_LAYER_TUNNEL); + struct mlx5_hrxq *hrxq; + unsigned int i; + + if (dev_handle->fate_action != MLX5_FLOW_FATE_QUEUE) + return; + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + dev_handle->rix_hrxq); + if (!hrxq) + return; + for (i = 0; i != hrxq->ind_table->queues_n; ++i) { + int idx = hrxq->ind_table->queues[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); + + /* + * To support metadata register copy on Tx loopback, + * this must be always enabled (metadata may arive + * from other port - not from local flows only. + */ + if (priv->config.dv_flow_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(dev)) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n = 1; + } else if (mark) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n++; + } + if (tunnel) { + unsigned int j; + + /* Increase the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & + dev_handle->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]++; + break; + } + } + flow_rxq_tunnel_ptype_update(rxq_ctrl); + } + } +} + +/** + * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) for a flow + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] flow + * Pointer to flow structure. + */ +static void +flow_rxq_flags_set(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + struct mlx5_flow_handle *dev_handle; + + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dev_handle, next) + flow_drv_rxq_flags_set(dev, dev_handle); +} + +/** + * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the + * device flow if no other flow uses it with the same kind of request. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] dev_handle + * Pointer to the device flow handle structure. 
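+ *
+ * A worked example of the resulting rxq.tunnel value, based on the
+ * tunnels_info[] table and flow_rxq_tunnel_ptype_update() above:
+ *
+ * @code
+ * // 1 VXLAN flow on the queue:
+ * //   rxq.tunnel == RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP
+ * // + 1 GRE flow on the same queue:
+ * //   two different tunnel types -> rxq.tunnel == 0 (ambiguous, cleared)
+ * // GRE flow destroyed again (this trim helper):
+ * //   a single type remains -> the VXLAN ptype is restored
+ * @endcode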
+ */ +static void +flow_drv_rxq_flags_trim(struct rte_eth_dev *dev, + struct mlx5_flow_handle *dev_handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int mark = dev_handle->mark; + const int tunnel = !!(dev_handle->layers & MLX5_FLOW_LAYER_TUNNEL); + struct mlx5_hrxq *hrxq; + unsigned int i; + + if (dev_handle->fate_action != MLX5_FLOW_FATE_QUEUE) + return; + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + dev_handle->rix_hrxq); + if (!hrxq) + return; + MLX5_ASSERT(dev->data->dev_started); + for (i = 0; i != hrxq->ind_table->queues_n; ++i) { + int idx = hrxq->ind_table->queues[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); + + if (priv->config.dv_flow_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(dev)) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n = 1; + } else if (mark) { + rxq_ctrl->flow_mark_n--; + rxq_ctrl->rxq.mark = !!rxq_ctrl->flow_mark_n; + } + if (tunnel) { + unsigned int j; + + /* Decrease the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & + dev_handle->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]--; + break; + } + } + flow_rxq_tunnel_ptype_update(rxq_ctrl); + } + } +} + +/** + * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the + * @p flow if no other flow uses it with the same kind of request. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to the flow. + */ +static void +flow_rxq_flags_trim(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + struct mlx5_flow_handle *dev_handle; + + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dev_handle, next) + flow_drv_rxq_flags_trim(dev, dev_handle); +} + +/** + * Clear the Mark/Flag and Tunnel ptype information in all Rx queues. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +flow_rxq_flags_clear(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl; + unsigned int j; + + if (!(*priv->rxqs)[i]) + continue; + rxq_ctrl = container_of((*priv->rxqs)[i], + struct mlx5_rxq_ctrl, rxq); + rxq_ctrl->flow_mark_n = 0; + rxq_ctrl->rxq.mark = 0; + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) + rxq_ctrl->flow_tunnels_n[j] = 0; + rxq_ctrl->rxq.tunnel = 0; + } +} + +/** + * Set the Rx queue dynamic metadata (mask and offset) for a flow + * + * @param[in] dev + * Pointer to the Ethernet device structure. + */ +void +mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *data; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) { + if (!(*priv->rxqs)[i]) + continue; + data = (*priv->rxqs)[i]; + if (!rte_flow_dynf_metadata_avail()) { + data->dynf_meta = 0; + data->flow_meta_mask = 0; + data->flow_meta_offset = -1; + } else { + data->dynf_meta = 1; + data->flow_meta_mask = rte_flow_dynf_metadata_mask; + data->flow_meta_offset = rte_flow_dynf_metadata_offs; + } + } +} + +/* + * return a pointer to the desired action in the list of actions. + * + * @param[in] actions + * The list of actions to search the action in. + * @param[in] action + * The action to find. + * + * @return + * Pointer to the action in the list, if found. NULL otherwise. 
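+ *
+ * A minimal usage sketch, assuming @p actions is an END-terminated action
+ * array and the caller wants the target of a QUEUE action, if any:
+ *
+ * @code
+ * const struct rte_flow_action *act;
+ * const struct rte_flow_action_queue *queue;
+ *
+ * act = mlx5_flow_find_action(actions, RTE_FLOW_ACTION_TYPE_QUEUE);
+ * if (act != NULL) {
+ *         queue = act->conf;
+ *         // queue->index is the Rx queue the flow steers to.
+ * }
+ * @endcode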
+ */ +const struct rte_flow_action * +mlx5_flow_find_action(const struct rte_flow_action *actions, + enum rte_flow_action_type action) +{ + if (actions == NULL) + return NULL; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) + if (actions->type == action) + return actions; + return NULL; +} + +/* + * Validate the flag action. + * + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_flag(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't mark and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 flag" + " actions in same flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "flag action not supported for " + "egress"); + return 0; +} + +/* + * Validate the mark action. + * + * @param[in] action + * Pointer to the queue action. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_mark *mark = action->conf; + + if (!mark) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "configuration cannot be null"); + if (mark->id >= MLX5_FLOW_MARK_MAX) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &mark->id, + "mark id must in 0 <= id < " + RTE_STR(MLX5_FLOW_MARK_MAX)); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't flag and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 mark actions in same" + " flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "mark action not supported for " + "egress"); + return 0; +} + +/* + * Validate the drop action. + * + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_drop(uint64_t action_flags __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "drop action not supported for " + "egress"); + return 0; +} + +/* + * Validate the queue action. + * + * @param[in] action + * Pointer to the queue action. 
+ * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_queue(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_queue *queue = action->conf; + + if (action_flags & MLX5_FLOW_FATE_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions in" + " same flow"); + if (!priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "No Rx queues configured"); + if (queue->index >= priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue index out of range"); + if (!(*priv->rxqs)[queue->index]) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue is not configured"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "queue action not supported for " + "egress"); + return 0; +} + +/* + * Validate the rss action. + * + * @param[in] action + * Pointer to the queue action. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[in] item_flags + * Items that were detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_rss(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + uint64_t item_flags, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_rss *rss = action->conf; + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + unsigned int i; + + if (action_flags & MLX5_FLOW_FATE_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions" + " in same flow"); + if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT && + rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->func, + "RSS hash function not supported"); +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (rss->level > 2) +#else + if (rss->level > 1) +#endif + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->level, + "tunnel RSS is not supported"); + /* allow RSS key_len 0 in case of NULL (default) RSS key. 
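 * For illustration (editor's sketch, not part of this patch; "queues" is a
 * hypothetical uint16_t[4]), a default-key RSS action accepted below would
 * look like:
 *   struct rte_flow_action_rss rss_conf = {
 *           .key = NULL, .key_len = 0,       /* use the default RSS key */
 *           .queue_num = 4, .queue = queues,
 *   };
 * whereas an explicit key must supply exactly MLX5_RSS_HASH_KEY_LEN bytes.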
*/ + if (rss->key_len == 0 && rss->key != NULL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key length 0"); + if (rss->key_len > 0 && rss->key_len < MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too small"); + if (rss->key_len > MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too large"); + if (rss->queue_num > priv->config.ind_table_max_size) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue_num, + "number of queues too large"); + if (rss->types & MLX5_RSS_HF_MASK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->types, + "some RSS protocols are not" + " supported"); + if ((rss->types & (ETH_RSS_L3_SRC_ONLY | ETH_RSS_L3_DST_ONLY)) && + !(rss->types & ETH_RSS_IP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL, + "L3 partial RSS requested but L3 RSS" + " type not specified"); + if ((rss->types & (ETH_RSS_L4_SRC_ONLY | ETH_RSS_L4_DST_ONLY)) && + !(rss->types & (ETH_RSS_UDP | ETH_RSS_TCP))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL, + "L4 partial RSS requested but L4 RSS" + " type not specified"); + if (!priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "No Rx queues configured"); + if (!rss->queue_num) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "No queues configured"); + for (i = 0; i != rss->queue_num; ++i) { + if (rss->queue[i] >= priv->rxqs_n) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue[i], "queue index out of range"); + if (!(*priv->rxqs)[rss->queue[i]]) + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue[i], "queue is not configured"); + } + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "rss action not supported for " + "egress"); + if (rss->level > 1 && !tunnel) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL, + "inner RSS is not supported for " + "non-tunnel flows"); + return 0; +} + +/* + * Validate the count action. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_count(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "count action not supported for " + "egress"); + return 0; +} + +/** + * Verify the @p attributes will be correctly understood by the NIC and store + * them in the @p flow if everything is correct. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attributes + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
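 *
 * Illustrative call pattern (editor's sketch, not part of this patch; the
 * caller variables are hypothetical):
 *
 *   ret = mlx5_flow_validate_attributes(dev, attr, error);
 *   if (ret < 0)
 *           return ret;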
+ */ +int +mlx5_flow_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t priority_max = priv->config.flow_prio - 1; + + if (attributes->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, "groups is not supported"); + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "priority out of range"); + if (attributes->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "egress is not supported"); + if (attributes->transfer && !priv->config.dv_esw_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, "transfer is not supported"); + if (!attributes->ingress) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "ingress attribute is mandatory"); + return 0; +} + +/** + * Validate ICMP6 item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_icmp6(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_icmp6 *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + if (target_protocol != 0xFF && target_protocol != IPPROTO_ICMPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with ICMP6 layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv6 is mandatory to filter on" + " ICMP6"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_icmp6_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_icmp6_mask, + sizeof(struct rte_flow_item_icmp6), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate ICMP item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_icmp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_icmp *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + const uint64_t l4m = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + if (target_protocol != 0xFF && target_protocol != IPPROTO_ICMP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with ICMP layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv4 is mandatory to filter" + " on ICMP"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_icmp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_icmp_mask, + sizeof(struct rte_flow_item_icmp), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate Ethernet item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_eth(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) +{ + const struct rte_flow_item_eth *mask = item->mask; + const struct rte_flow_item_eth nic_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .type = RTE_BE16(0xffff), + }; + int ret; + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t ethm = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + + if (item_flags & ethm) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L2 layers not supported"); + if ((!tunnel && (item_flags & MLX5_FLOW_LAYER_OUTER_L3)) || + (tunnel && (item_flags & MLX5_FLOW_LAYER_INNER_L3))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L2 layer should not follow " + "L3 layers"); + if ((!tunnel && (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) || + (tunnel && (item_flags & MLX5_FLOW_LAYER_INNER_VLAN))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L2 layer should not follow VLAN"); + if (!mask) + mask = &rte_flow_item_eth_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_eth), + error); + return ret; +} + +/** + * Validate VLAN item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] dev + * Ethernet device flow is being created on. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + const struct rte_flow_item_vlan nic_mask = { + .tci = RTE_BE16(UINT16_MAX), + .inner_type = RTE_BE16(UINT16_MAX), + }; + uint16_t vlan_tag = 0; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int ret; + const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | + MLX5_FLOW_LAYER_INNER_L4) : + (MLX5_FLOW_LAYER_OUTER_L3 | + MLX5_FLOW_LAYER_OUTER_L4); + const uint64_t vlanm = tunnel ? 
MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + + if (item_flags & vlanm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple VLAN layers not supported"); + else if ((item_flags & l34m) != 0) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VLAN cannot follow L3/L4 layer"); + if (!mask) + mask = &rte_flow_item_vlan_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_vlan), + error); + if (ret) + return ret; + if (!tunnel && mask->tci != RTE_BE16(0x0fff)) { + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->vmwa_context) { + /* + * Non-NULL context means we have a virtual machine + * and SR-IOV enabled, we have to create VLAN interface + * to make hypervisor to setup E-Switch vport + * context correctly. We avoid creating the multiple + * VLAN interfaces, so we cannot support VLAN tag mask. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VLAN tag mask is not" + " supported in virtual" + " environment"); + } + } + if (spec) { + vlan_tag = spec->tci; + vlan_tag &= mask->tci; + } + /* + * From verbs perspective an empty VLAN is equivalent + * to a packet without VLAN layer. + */ + if (!vlan_tag) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "VLAN cannot be empty"); + return 0; +} + +/** + * Validate IPV4 item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] acc_mask + * Acceptable mask, if NULL default internal default mask + * will be used to check whether item fields are supported. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv4 *acc_mask, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv4 *mask = item->mask; + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 nic_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + }, + }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + uint8_t next_proto = 0xFF; + const uint64_t l2_vlan = (MLX5_FLOW_LAYER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN | + MLX5_FLOW_LAYER_INNER_VLAN); + + if ((last_item & l2_vlan) && ether_type && + ether_type != RTE_ETHER_TYPE_IPV4) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv4 cannot follow L2/VLAN layer " + "which ether type is not IPv4"); + if (item_flags & MLX5_FLOW_LAYER_IPIP) { + if (mask && spec) + next_proto = mask->hdr.next_proto_id & + spec->hdr.next_proto_id; + if (next_proto == IPPROTO_IPIP || next_proto == IPPROTO_IPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "multiple tunnel " + "not supported"); + } + if (item_flags & MLX5_FLOW_LAYER_IPV6_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "wrong tunnel type - IPv6 specified " + "but IPv4 item provided"); + if (item_flags & l3m) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L3 layers not supported"); + else if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an L4 layer."); + else if ((item_flags & MLX5_FLOW_LAYER_NVGRE) && + !(item_flags & MLX5_FLOW_LAYER_INNER_L2)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an NVGRE layer."); + if (!mask) + mask = &rte_flow_item_ipv4_mask; + else if (mask->hdr.next_proto_id != 0 && + mask->hdr.next_proto_id != 0xff) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "partial mask is not supported" + " for protocol"); + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + acc_mask ? (const uint8_t *)acc_mask + : (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv4), + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate IPV6 item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] acc_mask + * Acceptable mask, if NULL default internal default mask + * will be used to check whether item fields are supported. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv6 *acc_mask, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv6 *mask = item->mask; + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 nic_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + }, + }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + uint8_t next_proto = 0xFF; + const uint64_t l2_vlan = (MLX5_FLOW_LAYER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN | + MLX5_FLOW_LAYER_INNER_VLAN); + + if ((last_item & l2_vlan) && ether_type && + ether_type != RTE_ETHER_TYPE_IPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv6 cannot follow L2/VLAN layer " + "which ether type is not IPv6"); + if (item_flags & MLX5_FLOW_LAYER_IPV6_ENCAP) { + if (mask && spec) + next_proto = mask->hdr.proto & spec->hdr.proto; + if (next_proto == IPPROTO_IPIP || next_proto == IPPROTO_IPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "multiple tunnel " + "not supported"); + } + if (item_flags & MLX5_FLOW_LAYER_IPIP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "wrong tunnel type - IPv4 specified " + "but IPv6 item provided"); + if (item_flags & l3m) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L3 layers not supported"); + else if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an L4 layer."); + else if ((item_flags & MLX5_FLOW_LAYER_NVGRE) && + !(item_flags & MLX5_FLOW_LAYER_INNER_L2)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an NVGRE layer."); + if (!mask) + mask = &rte_flow_item_ipv6_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + acc_mask ? (const uint8_t *)acc_mask + : (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv6), + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate UDP item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[in] flow_mask + * mlx5 flow-specific (DV, verbs, etc.) supported header fields mask. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_udp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_udp *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + if (target_protocol != 0xff && target_protocol != IPPROTO_UDP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with UDP layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 is mandatory to filter on L4"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_udp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_udp_mask, + sizeof(struct rte_flow_item_udp), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate TCP item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. 
+ * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + const struct rte_flow_item_tcp *flow_mask, + struct rte_flow_error *error) +{ + const struct rte_flow_item_tcp *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + MLX5_ASSERT(flow_mask); + if (target_protocol != 0xff && target_protocol != IPPROTO_TCP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with TCP layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 is mandatory to filter on L4"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_tcp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)flow_mask, + sizeof(struct rte_flow_item_tcp), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate VXLAN item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vxlan *spec = item->spec; + const struct rte_flow_item_vxlan *mask = item->mask; + int ret; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_vxlan_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_vxlan_mask, + sizeof(struct rte_flow_item_vxlan), + error); + if (ret < 0) + return ret; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + memcpy(&id.vni[1], mask->vni, 3); + } + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VXLAN tunnel must be fully defined"); + return 0; +} + +/** + * Validate VXLAN_GPE item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] priv + * Pointer to the private data structure. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_vxlan_gpe *spec = item->spec; + const struct rte_flow_item_vxlan_gpe *mask = item->mask; + int ret; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!priv->config.l3_vxlan_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 VXLAN is not enabled by device" + " parameter and/or not configured in" + " firmware"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_vxlan_gpe_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_vxlan_gpe_mask, + sizeof(struct rte_flow_item_vxlan_gpe), + error); + if (ret < 0) + return ret; + if (spec) { + if (spec->protocol) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VxLAN-GPE protocol" + " not supported"); + memcpy(&id.vni[1], spec->vni, 3); + memcpy(&id.vni[1], mask->vni, 3); + } + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VXLAN-GPE tunnel must be fully" + " defined"); + return 0; +} +/** + * Validate GRE Key item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit flags to mark detected items. + * @param[in] gre_item + * Pointer to gre_item + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_gre_key(const struct rte_flow_item *item, + uint64_t item_flags, + const struct rte_flow_item *gre_item, + struct rte_flow_error *error) +{ + const rte_be32_t *mask = item->mask; + int ret = 0; + rte_be32_t gre_key_default_mask = RTE_BE32(UINT32_MAX); + const struct rte_flow_item_gre *gre_spec; + const struct rte_flow_item_gre *gre_mask; + + if (item_flags & MLX5_FLOW_LAYER_GRE_KEY) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Multiple GRE key not support"); + if (!(item_flags & MLX5_FLOW_LAYER_GRE)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "No preceding GRE header"); + if (item_flags & MLX5_FLOW_LAYER_INNER) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "GRE key following a wrong item"); + gre_mask = gre_item->mask; + if (!gre_mask) + gre_mask = &rte_flow_item_gre_mask; + gre_spec = gre_item->spec; + if (gre_spec && (gre_mask->c_rsvd0_ver & RTE_BE16(0x2000)) && + !(gre_spec->c_rsvd0_ver & RTE_BE16(0x2000))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Key bit must be on"); + + if (!mask) + mask = &gre_key_default_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&gre_key_default_mask, + sizeof(rte_be32_t), error); + return ret; +} + +/** + * Validate GRE item. 
+ * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit flags to mark detected items. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_gre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_gre *spec __rte_unused = item->spec; + const struct rte_flow_item_gre *mask = item->mask; + int ret; + const struct rte_flow_item_gre nic_mask = { + .c_rsvd0_ver = RTE_BE16(0xB000), + .protocol = RTE_BE16(UINT16_MAX), + }; + + if (target_protocol != 0xff && target_protocol != IPPROTO_GRE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with this GRE layer"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 Layer is missing"); + if (!mask) + mask = &rte_flow_item_gre_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_gre), error); + if (ret < 0) + return ret; +#ifndef HAVE_MLX5DV_DR +#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT + if (spec && (spec->protocol & mask->protocol)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "without MPLS support the" + " specification cannot be used for" + " filtering"); +#endif +#endif + return 0; +} + +/** + * Validate Geneve item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ + +int +mlx5_flow_validate_item_geneve(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_geneve *spec = item->spec; + const struct rte_flow_item_geneve *mask = item->mask; + int ret; + uint16_t gbhdr; + uint8_t opt_len = priv->config.hca_attr.geneve_max_opt_len ?
+ MLX5_GENEVE_OPT_LEN_1 : MLX5_GENEVE_OPT_LEN_0; + const struct rte_flow_item_geneve nic_mask = { + .ver_opt_len_o_c_rsvd0 = RTE_BE16(0x3f80), + .vni = "\xff\xff\xff", + .protocol = RTE_BE16(UINT16_MAX), + }; + + if (!priv->config.hca_attr.tunnel_stateless_geneve_rx) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 Geneve is not enabled by device" + " parameter and/or not configured in" + " firmware"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_geneve_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_geneve), error); + if (ret) + return ret; + if (spec) { + gbhdr = rte_be_to_cpu_16(spec->ver_opt_len_o_c_rsvd0); + if (MLX5_GENEVE_VER_VAL(gbhdr) || + MLX5_GENEVE_CRITO_VAL(gbhdr) || + MLX5_GENEVE_RSVD_VAL(gbhdr) || spec->rsvd1) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "Geneve protocol unsupported" + " fields are being used"); + if (MLX5_GENEVE_OPTLEN_VAL(gbhdr) > opt_len) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "Unsupported Geneve options length"); + } + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Geneve tunnel must be fully defined"); + return 0; +} + +/** + * Validate MPLS item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] prev_layer + * The protocol layer indicated in previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_mpls(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags __rte_unused, + uint64_t prev_layer __rte_unused, + struct rte_flow_error *error) +{ +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + const struct rte_flow_item_mpls *mask = item->mask; + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + if (!priv->config.mpls_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "MPLS not supported or" + " disabled in firmware" + " configuration."); + /* MPLS over IP, UDP, GRE is allowed */ + if (!(prev_layer & (MLX5_FLOW_LAYER_OUTER_L3 | + MLX5_FLOW_LAYER_OUTER_L4_UDP | + MLX5_FLOW_LAYER_GRE))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with MPLS layer"); + /* Multi-tunnel isn't allowed but MPLS over GRE is an exception. 
*/ + if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) && + !(item_flags & MLX5_FLOW_LAYER_GRE)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!mask) + mask = &rte_flow_item_mpls_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_mpls_mask, + sizeof(struct rte_flow_item_mpls), error); + if (ret < 0) + return ret; + return 0; +#endif + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "MPLS is not supported by Verbs, please" + " update."); +} + +/** + * Validate NVGRE item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit flags to mark detected items. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_nvgre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_nvgre *mask = item->mask; + int ret; + + if (target_protocol != 0xff && target_protocol != IPPROTO_GRE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with this GRE layer"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 Layer is missing"); + if (!mask) + mask = &rte_flow_item_nvgre_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_nvgre_mask, + sizeof(struct rte_flow_item_nvgre), error); + if (ret < 0) + return ret; + return 0; +} + +/* Allocate unique ID for the split Q/RSS subflows. */ +static uint32_t +flow_qrss_get_id(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t qrss_id, ret; + + ret = mlx5_flow_id_get(priv->qrss_id_pool, &qrss_id); + if (ret) + return 0; + MLX5_ASSERT(qrss_id); + return qrss_id; +} + +/* Free unique ID for the split Q/RSS subflows. */ +static void +flow_qrss_free_id(struct rte_eth_dev *dev, uint32_t qrss_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (qrss_id) + mlx5_flow_id_release(priv->qrss_id_pool, qrss_id); +} + +/** + * Release resource related QUEUE/RSS action split. + * + * @param dev + * Pointer to Ethernet device. + * @param flow + * Flow to release id's from. 
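 *
 * Editor's note (illustrative, not part of this patch): the ids released
 * here were obtained from flow_qrss_get_id() above, e.g.:
 *
 *   uint32_t id = flow_qrss_get_id(dev);   /* returns 0 on allocation failure */
 *   ...
 *   flow_qrss_free_id(dev, id);            /* no-op when id == 0 */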
+ */ +static void +flow_mreg_split_qrss_release(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + struct mlx5_flow_handle *dev_handle; + + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dev_handle, next) + if (dev_handle->split_flow_id) + flow_qrss_free_id(dev, dev_handle->split_flow_id); +} + +static int +flow_null_validate(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + bool external __rte_unused, + int hairpin __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +static struct mlx5_flow * +flow_null_prepare(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error) +{ + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); + return NULL; +} + +static int +flow_null_translate(struct rte_eth_dev *dev __rte_unused, + struct mlx5_flow *dev_flow __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +static int +flow_null_apply(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +static void +flow_null_remove(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused) +{ +} + +static void +flow_null_destroy(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused) +{ +} + +static int +flow_null_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + const struct rte_flow_action *actions __rte_unused, + void *data __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +/* Void driver to protect from null pointer reference. */ +const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops = { + .validate = flow_null_validate, + .prepare = flow_null_prepare, + .translate = flow_null_translate, + .apply = flow_null_apply, + .remove = flow_null_remove, + .destroy = flow_null_destroy, + .query = flow_null_query, +}; + +/** + * Select flow driver type according to flow attributes and device + * configuration. + * + * @param[in] dev + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * + * @return + * flow driver type, MLX5_FLOW_TYPE_MAX otherwise. + */ +static enum mlx5_flow_drv_type +flow_get_drv_type(struct rte_eth_dev *dev, const struct rte_flow_attr *attr) +{ + struct mlx5_priv *priv = dev->data->dev_private; + enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX; + + if (attr->transfer && priv->config.dv_esw_en) + type = MLX5_FLOW_TYPE_DV; + if (!attr->transfer) + type = priv->config.dv_flow_en ? MLX5_FLOW_TYPE_DV : + MLX5_FLOW_TYPE_VERBS; + return type; +} + +#define flow_get_drv_ops(type) flow_drv_ops[type] + +/** + * Flow driver validation API. 
This abstracts calling driver specific functions. + * The type of flow driver is determined according to flow attributes. + * + * @param[in] dev + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] hairpin + * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static inline int +flow_drv_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, int hairpin, struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow_get_drv_type(dev, attr); + + fops = flow_get_drv_ops(type); + return fops->validate(dev, attr, items, actions, external, + hairpin, error); +} + +/** + * Flow driver preparation API. This abstracts calling driver specific + * functions. Parent flow (rte_flow) should have driver type (drv_type). It + * calculates the size of memory required for device flow, allocates the memory, + * initializes the device flow and returns the pointer. + * + * @note + * This function initializes device flow structure such as dv or verbs in + * struct mlx5_flow. However, it is caller's responsibility to initialize the + * rest. For example, adding returning device flow to flow->dev_flow list and + * setting backward reference to the flow should be done out of this function. + * layers field is not filled either. + * + * @param[in] dev + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] flow_idx + * This memory pool index to the flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to device flow on success, otherwise NULL and rte_errno is set. + */ +static inline struct mlx5_flow * +flow_drv_prepare(struct rte_eth_dev *dev, + const struct rte_flow *flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + uint32_t flow_idx, + struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + struct mlx5_flow *mlx5_flow = NULL; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + mlx5_flow = fops->prepare(dev, attr, items, actions, error); + if (mlx5_flow) + mlx5_flow->flow_idx = flow_idx; + return mlx5_flow; +} + +/** + * Flow driver translation API. This abstracts calling driver specific + * functions. Parent flow (rte_flow) should have driver type (drv_type). It + * translates a generic flow into a driver flow. flow_drv_prepare() must + * precede. + * + * @note + * dev_flow->layers could be filled as a result of parsing during translation + * if needed by flow_drv_apply(). dev_flow->flow->actions can also be filled + * if necessary. As a flow can have multiple dev_flows by RSS flow expansion, + * flow->actions could be overwritten even though all the expanded dev_flows + * have the same actions. + * + * @param[in] dev + * Pointer to the rte dev structure. 
+ * @param[in, out] dev_flow + * Pointer to the mlx5 flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static inline int +flow_drv_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = dev_flow->flow->drv_type; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + return fops->translate(dev, dev_flow, attr, items, actions, error); +} + +/** + * Flow driver apply API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It applies + * translated driver flows on to device. flow_drv_translate() must precede. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static inline int +flow_drv_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + return fops->apply(dev, flow, error); +} + +/** + * Flow driver remove API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It removes a flow + * on device. All the resources of the flow should be freed by calling + * flow_drv_destroy(). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + */ +static inline void +flow_drv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + fops->remove(dev, flow); +} + +/** + * Flow driver destroy API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It removes a flow + * on device and releases resources of the flow. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + */ +static inline void +flow_drv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + + flow_mreg_split_qrss_release(dev, flow); + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + fops->destroy(dev, flow); +} + +/** + * Get RSS action from the action list. + * + * @param[in] actions + * Pointer to the list of actions. + * + * @return + * Pointer to the RSS action if exist, else return NULL. 
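 *
 * Illustrative pairing with find_graph_root() below (editor's sketch, not
 * part of this patch):
 *
 *   const struct rte_flow_action_rss *rss = flow_get_rss_action(actions);
 *   unsigned int graph_root = find_graph_root(items, rss ? rss->level : 0);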
*/ +static const struct rte_flow_action_rss* +flow_get_rss_action(const struct rte_flow_action actions[]) +{ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_RSS: + return (const struct rte_flow_action_rss *) + actions->conf; + default: + break; + } + } + return NULL; +} + +static unsigned int +find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level) +{ + const struct rte_flow_item *item; + unsigned int has_vlan = 0; + + for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) { + has_vlan = 1; + break; + } + } + if (has_vlan) + return rss_level < 2 ? MLX5_EXPANSION_ROOT_ETH_VLAN : + MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN; + return rss_level < 2 ? MLX5_EXPANSION_ROOT : + MLX5_EXPANSION_ROOT_OUTER; +} + +/** + * Get layer flags from the prefix flow. + * + * Some flows may be split into several subflows; the prefix subflow gets the + * match items and the suffix subflow gets the actions. + * Some actions need the user defined match item flags to get the detail for + * the action. + * This function helps the suffix flow to get the item layer flags from the + * prefix subflow. + * + * @param[in] dev_flow + * Pointer to the created prefix subflow. + * + * @return + * The layers obtained from the prefix subflow. + */ +static inline uint64_t +flow_get_prefix_layer_flags(struct mlx5_flow *dev_flow) +{ + uint64_t layers = 0; + + /* + * The layers bits could be cached in a local variable, but usually + * the compiler will do that optimization for us. + * If there is no decap action, use the layers directly. + */ + if (!(dev_flow->act_flags & MLX5_FLOW_ACTION_DECAP)) + return dev_flow->handle->layers; + /* Convert L3 layers with decap action. */ + if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L3_IPV4) + layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + else if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L3_IPV6) + layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + /* Convert L4 layers with decap action. */ + if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L4_TCP) + layers |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + else if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L4_UDP) + layers |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + return layers; +} + +/** + * Get metadata split action information. + * + * @param[in] actions + * Pointer to the list of actions. + * @param[out] qrss + * Pointer to the return pointer. + * @param[out] qrss_type + * Pointer to the action type to return. RTE_FLOW_ACTION_TYPE_END is returned + * if no QUEUE/RSS is found. + * @param[out] encap_idx + * Pointer to the index of the encap action if exists, otherwise the last + * action index. + * + * @return + * Total number of actions. + */ +static int +flow_parse_metadata_split_actions_info(const struct rte_flow_action actions[], + const struct rte_flow_action **qrss, + int *encap_idx) +{ + const struct rte_flow_action_raw_encap *raw_encap; + int actions_n = 0; + int raw_decap_idx = -1; + + *encap_idx = -1; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + *encap_idx = actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + raw_decap_idx = actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size > MLX5_ENCAPSULATION_DECISION_SIZE) + *encap_idx = raw_decap_idx != -1 ?
+ raw_decap_idx : actions_n; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + case RTE_FLOW_ACTION_TYPE_RSS: + *qrss = actions; + break; + default: + break; + } + actions_n++; + } + if (*encap_idx == -1) + *encap_idx = actions_n; + /* Count RTE_FLOW_ACTION_TYPE_END. */ + return actions_n + 1; +} + +/** + * Check meter action from the action list. + * + * @param[in] actions + * Pointer to the list of actions. + * @param[out] mtr + * Pointer to the meter exist flag. + * + * @return + * Total number of actions. + */ +static int +flow_check_meter_action(const struct rte_flow_action actions[], uint32_t *mtr) +{ + int actions_n = 0; + + MLX5_ASSERT(mtr); + *mtr = 0; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_METER: + *mtr = 1; + break; + default: + break; + } + actions_n++; + } + /* Count RTE_FLOW_ACTION_TYPE_END. */ + return actions_n + 1; +} + +/** + * Check if the flow should be splited due to hairpin. + * The reason for the split is that in current HW we can't + * support encap on Rx, so if a flow have encap we move it + * to Tx. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] attr + * Flow rule attributes. + * @param[in] actions + * Associated actions (list terminated by the END action). + * + * @return + * > 0 the number of actions and the flow should be split, + * 0 when no split required. + */ +static int +flow_check_hairpin_split(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_action actions[]) +{ + int queue_action = 0; + int action_n = 0; + int encap = 0; + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + const struct rte_flow_action_raw_encap *raw_encap; + + if (!attr->ingress) + return 0; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_QUEUE: + queue = actions->conf; + if (queue == NULL) + return 0; + if (mlx5_rxq_get_type(dev, queue->index) != + MLX5_RXQ_TYPE_HAIRPIN) + return 0; + queue_action = 1; + action_n++; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = actions->conf; + if (rss == NULL || rss->queue_num == 0) + return 0; + if (mlx5_rxq_get_type(dev, rss->queue[0]) != + MLX5_RXQ_TYPE_HAIRPIN) + return 0; + queue_action = 1; + action_n++; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + encap = 1; + action_n++; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size > + (sizeof(struct rte_flow_item_eth) + + sizeof(struct rte_flow_item_ipv4))) + encap = 1; + action_n++; + break; + default: + action_n++; + break; + } + } + if (encap == 1 && queue_action) + return action_n; + return 0; +} + +/* Declare flow create/destroy prototype in advance. */ +static uint32_t +flow_list_create(struct rte_eth_dev *dev, uint32_t *list, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, struct rte_flow_error *error); + +static void +flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list, + uint32_t flow_idx); + +/** + * Add a flow of copying flow metadata registers in RX_CP_TBL. + * + * As mark_id is unique, if there's already a registered flow for the mark_id, + * return by increasing the reference counter of the resource. Otherwise, create + * the resource (mcp_res) and flow. 
+ * + * Flow looks like, + * - If ingress port is ANY and reg_c[1] is mark_id, + * flow_tag := mark_id, reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * For default flow (zero mark_id), flow is like, + * - If ingress port is ANY, + * reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @param mark_id + * ID of MARK action, zero means default flow for META. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * Associated resource on success, NULL otherwise and rte_errno is set. + */ +static struct mlx5_flow_mreg_copy_resource * +flow_mreg_add_copy_action(struct rte_eth_dev *dev, uint32_t mark_id, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow_attr attr = { + .group = MLX5_FLOW_MREG_CP_TABLE_GROUP, + .ingress = 1, + }; + struct mlx5_rte_flow_item_tag tag_spec = { + .data = mark_id, + }; + struct rte_flow_item items[] = { + [1] = { .type = RTE_FLOW_ITEM_TYPE_END, }, + }; + struct rte_flow_action_mark ftag = { + .id = mark_id, + }; + struct mlx5_flow_action_copy_mreg cp_mreg = { + .dst = REG_B, + .src = 0, + }; + struct rte_flow_action_jump jump = { + .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP, + }; + struct rte_flow_action actions[] = { + [3] = { .type = RTE_FLOW_ACTION_TYPE_END, }, + }; + struct mlx5_flow_mreg_copy_resource *mcp_res; + uint32_t idx = 0; + int ret; + + /* Fill the register fileds in the flow. */ + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return NULL; + tag_spec.id = ret; + ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_RX, 0, error); + if (ret < 0) + return NULL; + cp_mreg.src = ret; + /* Check if already registered. */ + MLX5_ASSERT(priv->mreg_cp_tbl); + mcp_res = (void *)mlx5_hlist_lookup(priv->mreg_cp_tbl, mark_id); + if (mcp_res) { + /* For non-default rule. */ + if (mark_id != MLX5_DEFAULT_COPY_ID) + mcp_res->refcnt++; + MLX5_ASSERT(mark_id != MLX5_DEFAULT_COPY_ID || + mcp_res->refcnt == 1); + return mcp_res; + } + /* Provide the full width of FLAG specific value. */ + if (mark_id == (priv->sh->dv_regc0_mask & MLX5_FLOW_MARK_DEFAULT)) + tag_spec.data = MLX5_FLOW_MARK_DEFAULT; + /* Build a new flow. */ + if (mark_id != MLX5_DEFAULT_COPY_ID) { + items[0] = (struct rte_flow_item){ + .type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG, + .spec = &tag_spec, + }; + items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_END, + }; + actions[0] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_MARK, + .conf = &ftag, + }; + actions[1] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &cp_mreg, + }; + actions[2] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = &jump, + }; + actions[3] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + } else { + /* Default rule, wildcard match. */ + attr.priority = MLX5_FLOW_PRIO_RSVD; + items[0] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_END, + }; + actions[0] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &cp_mreg, + }; + actions[1] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = &jump, + }; + actions[2] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + } + /* Build a new entry. 
*/ + mcp_res = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MCP], &idx); + if (!mcp_res) { + rte_errno = ENOMEM; + return NULL; + } + mcp_res->idx = idx; + /* + * The copy Flows are not included in any list. There + * ones are referenced from other Flows and can not + * be applied, removed, deleted in ardbitrary order + * by list traversing. + */ + mcp_res->rix_flow = flow_list_create(dev, NULL, &attr, items, + actions, false, error); + if (!mcp_res->rix_flow) + goto error; + mcp_res->refcnt++; + mcp_res->hlist_ent.key = mark_id; + ret = mlx5_hlist_insert(priv->mreg_cp_tbl, + &mcp_res->hlist_ent); + MLX5_ASSERT(!ret); + if (ret) + goto error; + return mcp_res; +error: + if (mcp_res->rix_flow) + flow_list_destroy(dev, NULL, mcp_res->rix_flow); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx); + return NULL; +} + +/** + * Release flow in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @flow + * Parent flow for wich copying is provided. + */ +static void +flow_mreg_del_copy_action(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow->rix_mreg_copy) + return; + mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP], + flow->rix_mreg_copy); + if (!mcp_res || !priv->mreg_cp_tbl) + return; + if (flow->copy_applied) { + MLX5_ASSERT(mcp_res->appcnt); + flow->copy_applied = 0; + --mcp_res->appcnt; + if (!mcp_res->appcnt) { + struct rte_flow *mcp_flow = mlx5_ipool_get + (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + mcp_res->rix_flow); + + if (mcp_flow) + flow_drv_remove(dev, mcp_flow); + } + } + /* + * We do not check availability of metadata registers here, + * because copy resources are not allocated in this case. + */ + if (--mcp_res->refcnt) + return; + MLX5_ASSERT(mcp_res->rix_flow); + flow_list_destroy(dev, NULL, mcp_res->rix_flow); + mlx5_hlist_remove(priv->mreg_cp_tbl, &mcp_res->hlist_ent); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx); + flow->rix_mreg_copy = 0; +} + +/** + * Start flow in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @flow + * Parent flow for wich copying is provided. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_mreg_start_copy_action(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + if (!flow->rix_mreg_copy || flow->copy_applied) + return 0; + mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP], + flow->rix_mreg_copy); + if (!mcp_res) + return 0; + if (!mcp_res->appcnt) { + struct rte_flow *mcp_flow = mlx5_ipool_get + (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + mcp_res->rix_flow); + + if (mcp_flow) { + ret = flow_drv_apply(dev, mcp_flow, NULL); + if (ret) + return ret; + } + } + ++mcp_res->appcnt; + flow->copy_applied = 1; + return 0; +} + +/** + * Stop flow in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @flow + * Parent flow for wich copying is provided. 
+ */ +static void +flow_mreg_stop_copy_action(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow->rix_mreg_copy || !flow->copy_applied) + return; + mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP], + flow->rix_mreg_copy); + if (!mcp_res) + return; + MLX5_ASSERT(mcp_res->appcnt); + --mcp_res->appcnt; + flow->copy_applied = 0; + if (!mcp_res->appcnt) { + struct rte_flow *mcp_flow = mlx5_ipool_get + (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + mcp_res->rix_flow); + + if (mcp_flow) + flow_drv_remove(dev, mcp_flow); + } +} + +/** + * Remove the default copy action from RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +flow_mreg_del_default_copy_action(struct rte_eth_dev *dev) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + + /* Check if default flow is registered. */ + if (!priv->mreg_cp_tbl) + return; + mcp_res = (void *)mlx5_hlist_lookup(priv->mreg_cp_tbl, + MLX5_DEFAULT_COPY_ID); + if (!mcp_res) + return; + MLX5_ASSERT(mcp_res->rix_flow); + flow_list_destroy(dev, NULL, mcp_res->rix_flow); + mlx5_hlist_remove(priv->mreg_cp_tbl, &mcp_res->hlist_ent); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx); +} + +/** + * Add the default copy action in in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 for success, negative value otherwise and rte_errno is set. + */ +static int +flow_mreg_add_default_copy_action(struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_mreg_copy_resource *mcp_res; + + /* Check whether extensive metadata feature is engaged. */ + if (!priv->config.dv_flow_en || + priv->config.dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || + !mlx5_flow_ext_mreg_supported(dev) || + !priv->sh->dv_regc0_mask) + return 0; + mcp_res = flow_mreg_add_copy_action(dev, MLX5_DEFAULT_COPY_ID, error); + if (!mcp_res) + return -rte_errno; + return 0; +} + +/** + * Add a flow of copying flow metadata registers in RX_CP_TBL. + * + * All the flow having Q/RSS action should be split by + * flow_mreg_split_qrss_prep() to pass by RX_CP_TBL. A flow in the RX_CP_TBL + * performs the following, + * - CQE->flow_tag := reg_c[1] (MARK) + * - CQE->flow_table_metadata (reg_b) := reg_c[0] (META) + * As CQE's flow_tag is not a register, it can't be simply copied from reg_c[1] + * but there should be a flow per each MARK ID set by MARK action. + * + * For the aforementioned reason, if there's a MARK action in flow's action + * list, a corresponding flow should be added to the RX_CP_TBL in order to copy + * the MARK ID to CQE's flow_tag like, + * - If reg_c[1] is mark_id, + * flow_tag := mark_id, reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * For SET_META action which stores value in reg_c[0], as the destination is + * also a flow metadata register (reg_b), adding a default flow is enough. Zero + * MARK ID means the default flow. The default flow looks like, + * - For all flow, reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @param flow + * Pointer to flow structure. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, negative value otherwise and rte_errno is set. 
+ */ +static int +flow_mreg_update_copy_table(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + struct mlx5_flow_mreg_copy_resource *mcp_res; + const struct rte_flow_action_mark *mark; + + /* Check whether extensive metadata feature is engaged. */ + if (!config->dv_flow_en || + config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || + !mlx5_flow_ext_mreg_supported(dev) || + !priv->sh->dv_regc0_mask) + return 0; + /* Find MARK action. */ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_FLAG: + mcp_res = flow_mreg_add_copy_action + (dev, MLX5_FLOW_MARK_DEFAULT, error); + if (!mcp_res) + return -rte_errno; + flow->rix_mreg_copy = mcp_res->idx; + if (dev->data->dev_started) { + mcp_res->appcnt++; + flow->copy_applied = 1; + } + return 0; + case RTE_FLOW_ACTION_TYPE_MARK: + mark = (const struct rte_flow_action_mark *) + actions->conf; + mcp_res = + flow_mreg_add_copy_action(dev, mark->id, error); + if (!mcp_res) + return -rte_errno; + flow->rix_mreg_copy = mcp_res->idx; + if (dev->data->dev_started) { + mcp_res->appcnt++; + flow->copy_applied = 1; + } + return 0; + default: + break; + } + } + return 0; +} + +#define MLX5_MAX_SPLIT_ACTIONS 24 +#define MLX5_MAX_SPLIT_ITEMS 24 + +/** + * Split the hairpin flow. + * Since HW can't support encap on Rx we move the encap to Tx. + * If the count action is after the encap then we also + * move the count action. in this case the count will also measure + * the outer bytes. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] actions_rx + * Rx flow actions. + * @param[out] actions_tx + * Tx flow actions.. + * @param[out] pattern_tx + * The pattern items for the Tx flow. + * @param[out] flow_id + * The flow ID connected to this flow. + * + * @return + * 0 on success. 
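+ *
+ * Rough sketch (example action list, assuming the queue is a hairpin queue):
+ *   ingress actions [raw_encap (size above the L2+IPv4 threshold) /
+ *   queue / end]
+ * are split so that the encap moves to the Tx action list, while the Rx
+ * list keeps the queue action plus an internal SET_TAG carrying the
+ * generated flow_id; the Tx pattern then matches that tag item to tie the
+ * two halves together.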
+ */ +static int +flow_hairpin_split(struct rte_eth_dev *dev, + const struct rte_flow_action actions[], + struct rte_flow_action actions_rx[], + struct rte_flow_action actions_tx[], + struct rte_flow_item pattern_tx[], + uint32_t *flow_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_raw_encap *raw_encap; + const struct rte_flow_action_raw_decap *raw_decap; + struct mlx5_rte_flow_action_set_tag *set_tag; + struct rte_flow_action *tag_action; + struct mlx5_rte_flow_item_tag *tag_item; + struct rte_flow_item *item; + char *addr; + int encap = 0; + + mlx5_flow_id_get(priv->sh->flow_id_pool, flow_id); + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + rte_memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + if (encap) { + rte_memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + } else { + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + } + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size > + (sizeof(struct rte_flow_item_eth) + + sizeof(struct rte_flow_item_ipv4))) { + memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + encap = 1; + } else { + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + } + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + raw_decap = actions->conf; + if (raw_decap->size < + (sizeof(struct rte_flow_item_eth) + + sizeof(struct rte_flow_item_ipv4))) { + memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + } else { + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + } + break; + default: + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + break; + } + } + /* Add set meta action and end action for the Rx flow. */ + tag_action = actions_rx; + tag_action->type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG; + actions_rx++; + rte_memcpy(actions_rx, actions, sizeof(struct rte_flow_action)); + actions_rx++; + set_tag = (void *)actions_rx; + set_tag->id = mlx5_flow_get_reg_id(dev, MLX5_HAIRPIN_RX, 0, NULL); + MLX5_ASSERT(set_tag->id > REG_NONE); + set_tag->data = *flow_id; + tag_action->conf = set_tag; + /* Create Tx item list. */ + rte_memcpy(actions_tx, actions, sizeof(struct rte_flow_action)); + addr = (void *)&pattern_tx[2]; + item = pattern_tx; + item->type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG; + tag_item = (void *)addr; + tag_item->data = *flow_id; + tag_item->id = mlx5_flow_get_reg_id(dev, MLX5_HAIRPIN_TX, 0, NULL); + MLX5_ASSERT(set_tag->id > REG_NONE); + item->spec = tag_item; + addr += sizeof(struct mlx5_rte_flow_item_tag); + tag_item = (void *)addr; + tag_item->data = UINT32_MAX; + tag_item->id = UINT16_MAX; + item->mask = tag_item; + addr += sizeof(struct mlx5_rte_flow_item_tag); + item->last = NULL; + item++; + item->type = RTE_FLOW_ITEM_TYPE_END; + return 0; +} + +/** + * The last stage of splitting chain, just creates the subflow + * without any modification. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] flow + * Parent flow structure pointer. + * @param[in, out] sub_flow + * Pointer to return the created subflow, may be NULL. + * @param[in] prefix_layers + * Prefix subflow layers, may be 0. 
+ * @param[in] attr + * Flow rule attributes. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] flow_idx + * This memory pool index to the flow. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * 0 on success, negative value otherwise + */ +static int +flow_create_split_inner(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct mlx5_flow **sub_flow, + uint64_t prefix_layers, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, uint32_t flow_idx, + struct rte_flow_error *error) +{ + struct mlx5_flow *dev_flow; + + dev_flow = flow_drv_prepare(dev, flow, attr, items, actions, + flow_idx, error); + if (!dev_flow) + return -rte_errno; + dev_flow->flow = flow; + dev_flow->external = external; + /* Subflow object was created, we must include one in the list. */ + SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx, + dev_flow->handle, next); + /* + * If dev_flow is as one of the suffix flow, some actions in suffix + * flow may need some user defined item layer flags. + */ + if (prefix_layers) + dev_flow->handle->layers = prefix_layers; + if (sub_flow) + *sub_flow = dev_flow; + return flow_drv_translate(dev, dev_flow, attr, items, actions, error); +} + +/** + * Split the meter flow. + * + * As meter flow will split to three sub flow, other than meter + * action, the other actions make sense to only meter accepts + * the packet. If it need to be dropped, no other additional + * actions should be take. + * + * One kind of special action which decapsulates the L3 tunnel + * header will be in the prefix sub flow, as not to take the + * L3 tunnel header into account. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[out] sfx_items + * Suffix flow match items (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] actions_sfx + * Suffix flow actions. + * @param[out] actions_pre + * Prefix flow actions. + * @param[out] pattern_sfx + * The pattern items for the suffix flow. + * @param[out] tag_sfx + * Pointer to suffix flow tag. + * + * @return + * 0 on success. + */ +static int +flow_meter_split_prep(struct rte_eth_dev *dev, + const struct rte_flow_item items[], + struct rte_flow_item sfx_items[], + const struct rte_flow_action actions[], + struct rte_flow_action actions_sfx[], + struct rte_flow_action actions_pre[]) +{ + struct rte_flow_action *tag_action = NULL; + struct rte_flow_item *tag_item; + struct mlx5_rte_flow_action_set_tag *set_tag; + struct rte_flow_error error; + const struct rte_flow_action_raw_encap *raw_encap; + const struct rte_flow_action_raw_decap *raw_decap; + struct mlx5_rte_flow_item_tag *tag_spec; + struct mlx5_rte_flow_item_tag *tag_mask; + uint32_t tag_id; + bool copy_vlan = false; + + /* Prepare the actions for prefix and suffix flow. */ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + struct rte_flow_action **action_cur = NULL; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_METER: + /* Add the extra tag action first. 
*/ + tag_action = actions_pre; + tag_action->type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG; + actions_pre++; + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size < MLX5_ENCAPSULATION_DECISION_SIZE) + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + raw_decap = actions->conf; + if (raw_decap->size > MLX5_ENCAPSULATION_DECISION_SIZE) + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + copy_vlan = true; + break; + default: + break; + } + if (!action_cur) + action_cur = &actions_sfx; + memcpy(*action_cur, actions, sizeof(struct rte_flow_action)); + (*action_cur)++; + } + /* Add end action to the actions. */ + actions_sfx->type = RTE_FLOW_ACTION_TYPE_END; + actions_pre->type = RTE_FLOW_ACTION_TYPE_END; + actions_pre++; + /* Set the tag. */ + set_tag = (void *)actions_pre; + set_tag->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, &error); + /* + * Get the id from the qrss_pool to make qrss share the id with meter. + */ + tag_id = flow_qrss_get_id(dev); + set_tag->data = tag_id << MLX5_MTR_COLOR_BITS; + assert(tag_action); + tag_action->conf = set_tag; + /* Prepare the suffix subflow items. */ + tag_item = sfx_items++; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int item_type = items->type; + + switch (item_type) { + case RTE_FLOW_ITEM_TYPE_PORT_ID: + memcpy(sfx_items, items, sizeof(*sfx_items)); + sfx_items++; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + if (copy_vlan) { + memcpy(sfx_items, items, sizeof(*sfx_items)); + /* + * Convert to internal match item, it is used + * for vlan push and set vid. + */ + sfx_items->type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_VLAN; + sfx_items++; + } + break; + default: + break; + } + } + sfx_items->type = RTE_FLOW_ITEM_TYPE_END; + sfx_items++; + tag_spec = (struct mlx5_rte_flow_item_tag *)sfx_items; + tag_spec->data = tag_id << MLX5_MTR_COLOR_BITS; + tag_spec->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, &error); + tag_mask = tag_spec + 1; + tag_mask->data = 0xffffff00; + tag_item->type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG; + tag_item->spec = tag_spec; + tag_item->last = NULL; + tag_item->mask = tag_mask; + return tag_id; +} + +/** + * Split action list having QUEUE/RSS for metadata register copy. + * + * Once Q/RSS action is detected in user's action list, the flow action + * should be split in order to copy metadata registers, which will happen in + * RX_CP_TBL like, + * - CQE->flow_tag := reg_c[1] (MARK) + * - CQE->flow_table_metadata (reg_b) := reg_c[0] (META) + * The Q/RSS action will be performed on RX_ACT_TBL after passing by RX_CP_TBL. + * This is because the last action of each flow must be a terminal action + * (QUEUE, RSS or DROP). + * + * Flow ID must be allocated to identify actions in the RX_ACT_TBL and it is + * stored and kept in the mlx5_flow structure per each sub_flow. + * + * The Q/RSS action is replaced with, + * - SET_TAG, setting the allocated flow ID to reg_c[2]. + * And the following JUMP action is added at the end, + * - JUMP, to RX_CP_TBL. + * + * A flow to perform remained Q/RSS action will be created in RX_ACT_TBL by + * flow_create_split_metadata() routine. The flow will look like, + * - If flow ID matches (reg_c[2]), perform Q/RSS. 
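+ * Example of the rewrite (schematic only): user actions [rss / end] become
+ *   [set_tag(reg_c[2] := flow_id) / jump group RX_CP_TBL / end]
+ * in the prefix flow, and the original Q/RSS is re-created by
+ * flow_create_split_metadata() in RX_ACT_TBL behind a match on the same
+ * flow_id tag.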
+ * + * @param dev + * Pointer to Ethernet device. + * @param[out] split_actions + * Pointer to store split actions to jump to CP_TBL. + * @param[in] actions + * Pointer to the list of original flow actions. + * @param[in] qrss + * Pointer to the Q/RSS action. + * @param[in] actions_n + * Number of original actions. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * non-zero unique flow_id on success, otherwise 0 and + * error/rte_error are set. + */ +static uint32_t +flow_mreg_split_qrss_prep(struct rte_eth_dev *dev, + struct rte_flow_action *split_actions, + const struct rte_flow_action *actions, + const struct rte_flow_action *qrss, + int actions_n, struct rte_flow_error *error) +{ + struct mlx5_rte_flow_action_set_tag *set_tag; + struct rte_flow_action_jump *jump; + const int qrss_idx = qrss - actions; + uint32_t flow_id = 0; + int ret = 0; + + /* + * Given actions will be split + * - Replace QUEUE/RSS action with SET_TAG to set flow ID. + * - Add jump to mreg CP_TBL. + * As a result, there will be one more action. + */ + ++actions_n; + memcpy(split_actions, actions, sizeof(*split_actions) * actions_n); + set_tag = (void *)(split_actions + actions_n); + /* + * If tag action is not set to void(it means we are not the meter + * suffix flow), add the tag action. Since meter suffix flow already + * has the tag added. + */ + if (split_actions[qrss_idx].type != RTE_FLOW_ACTION_TYPE_VOID) { + /* + * Allocate the new subflow ID. This one is unique within + * device and not shared with representors. Otherwise, + * we would have to resolve multi-thread access synch + * issue. Each flow on the shared device is appended + * with source vport identifier, so the resulting + * flows will be unique in the shared (by master and + * representors) domain even if they have coinciding + * IDs. + */ + flow_id = flow_qrss_get_id(dev); + if (!flow_id) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't allocate id " + "for split Q/RSS subflow"); + /* Internal SET_TAG action to set flow ID. */ + *set_tag = (struct mlx5_rte_flow_action_set_tag){ + .data = flow_id, + }; + ret = mlx5_flow_get_reg_id(dev, MLX5_COPY_MARK, 0, error); + if (ret < 0) + return ret; + set_tag->id = ret; + /* Construct new actions array. */ + /* Replace QUEUE/RSS action. */ + split_actions[qrss_idx] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG, + .conf = set_tag, + }; + } + /* JUMP action to jump to mreg copy table (CP_TBL). */ + jump = (void *)(set_tag + 1); + *jump = (struct rte_flow_action_jump){ + .group = MLX5_FLOW_MREG_CP_TABLE_GROUP, + }; + split_actions[actions_n - 2] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = jump, + }; + split_actions[actions_n - 1] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + return flow_id; +} + +/** + * Extend the given action list for Tx metadata copy. + * + * Copy the given action list to the ext_actions and add flow metadata register + * copy action in order to copy reg_a set by WQE to reg_c[0]. + * + * @param[out] ext_actions + * Pointer to the extended action list. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] actions_n + * Number of actions in the list. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @param[in] encap_idx + * The encap action inndex. 
+ * + * @return + * 0 on success, negative value otherwise + */ +static int +flow_mreg_tx_copy_prep(struct rte_eth_dev *dev, + struct rte_flow_action *ext_actions, + const struct rte_flow_action *actions, + int actions_n, struct rte_flow_error *error, + int encap_idx) +{ + struct mlx5_flow_action_copy_mreg *cp_mreg = + (struct mlx5_flow_action_copy_mreg *) + (ext_actions + actions_n + 1); + int ret; + + ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_RX, 0, error); + if (ret < 0) + return ret; + cp_mreg->dst = ret; + ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_TX, 0, error); + if (ret < 0) + return ret; + cp_mreg->src = ret; + if (encap_idx != 0) + memcpy(ext_actions, actions, sizeof(*ext_actions) * encap_idx); + if (encap_idx == actions_n - 1) { + ext_actions[actions_n - 1] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = cp_mreg, + }; + ext_actions[actions_n] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + } else { + ext_actions[encap_idx] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = cp_mreg, + }; + memcpy(ext_actions + encap_idx + 1, actions + encap_idx, + sizeof(*ext_actions) * (actions_n - encap_idx)); + } + return 0; +} + +/** + * The splitting for metadata feature. + * + * - Q/RSS action on NIC Rx should be split in order to pass by + * the mreg copy table (RX_CP_TBL) and then it jumps to the + * action table (RX_ACT_TBL) which has the split Q/RSS action. + * + * - All the actions on NIC Tx should have a mreg copy action to + * copy reg_a from WQE to reg_c[0]. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] flow + * Parent flow structure pointer. + * @param[in] prefix_layers + * Prefix flow layer flags. + * @param[in] attr + * Flow rule attributes. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] flow_idx + * This memory pool index to the flow. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * 0 on success, negative value otherwise + */ +static int +flow_create_split_metadata(struct rte_eth_dev *dev, + struct rte_flow *flow, + uint64_t prefix_layers, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, uint32_t flow_idx, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_action *qrss = NULL; + struct rte_flow_action *ext_actions = NULL; + struct mlx5_flow *dev_flow = NULL; + uint32_t qrss_id = 0; + int mtr_sfx = 0; + size_t act_size; + int actions_n; + int encap_idx; + int ret; + + /* Check whether extensive metadata feature is engaged. */ + if (!config->dv_flow_en || + config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || + !mlx5_flow_ext_mreg_supported(dev)) + return flow_create_split_inner(dev, flow, NULL, prefix_layers, + attr, items, actions, external, + flow_idx, error); + actions_n = flow_parse_metadata_split_actions_info(actions, &qrss, + &encap_idx); + if (qrss) { + /* Exclude hairpin flows from splitting. 
*/ + if (qrss->type == RTE_FLOW_ACTION_TYPE_QUEUE) { + const struct rte_flow_action_queue *queue; + + queue = qrss->conf; + if (mlx5_rxq_get_type(dev, queue->index) == + MLX5_RXQ_TYPE_HAIRPIN) + qrss = NULL; + } else if (qrss->type == RTE_FLOW_ACTION_TYPE_RSS) { + const struct rte_flow_action_rss *rss; + + rss = qrss->conf; + if (mlx5_rxq_get_type(dev, rss->queue[0]) == + MLX5_RXQ_TYPE_HAIRPIN) + qrss = NULL; + } + } + if (qrss) { + /* Check if it is in meter suffix table. */ + mtr_sfx = attr->group == (attr->transfer ? + (MLX5_FLOW_TABLE_LEVEL_SUFFIX - 1) : + MLX5_FLOW_TABLE_LEVEL_SUFFIX); + /* + * Q/RSS action on NIC Rx should be split in order to pass by + * the mreg copy table (RX_CP_TBL) and then it jumps to the + * action table (RX_ACT_TBL) which has the split Q/RSS action. + */ + act_size = sizeof(struct rte_flow_action) * (actions_n + 1) + + sizeof(struct rte_flow_action_set_tag) + + sizeof(struct rte_flow_action_jump); + ext_actions = rte_zmalloc(__func__, act_size, 0); + if (!ext_actions) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no memory to split " + "metadata flow"); + /* + * If we are the suffix flow of meter, tag already exist. + * Set the tag action to void. + */ + if (mtr_sfx) + ext_actions[qrss - actions].type = + RTE_FLOW_ACTION_TYPE_VOID; + else + ext_actions[qrss - actions].type = + (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG; + /* + * Create the new actions list with removed Q/RSS action + * and appended set tag and jump to register copy table + * (RX_CP_TBL). We should preallocate unique tag ID here + * in advance, because it is needed for set tag action. + */ + qrss_id = flow_mreg_split_qrss_prep(dev, ext_actions, actions, + qrss, actions_n, error); + if (!mtr_sfx && !qrss_id) { + ret = -rte_errno; + goto exit; + } + } else if (attr->egress && !attr->transfer) { + /* + * All the actions on NIC Tx should have a metadata register + * copy action to copy reg_a from WQE to reg_c[meta] + */ + act_size = sizeof(struct rte_flow_action) * (actions_n + 1) + + sizeof(struct mlx5_flow_action_copy_mreg); + ext_actions = rte_zmalloc(__func__, act_size, 0); + if (!ext_actions) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no memory to split " + "metadata flow"); + /* Create the action list appended with copy register. */ + ret = flow_mreg_tx_copy_prep(dev, ext_actions, actions, + actions_n, error, encap_idx); + if (ret < 0) + goto exit; + } + /* Add the unmodified original or prefix subflow. */ + ret = flow_create_split_inner(dev, flow, &dev_flow, prefix_layers, attr, + items, ext_actions ? ext_actions : + actions, external, flow_idx, error); + if (ret < 0) + goto exit; + MLX5_ASSERT(dev_flow); + if (qrss) { + const struct rte_flow_attr q_attr = { + .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP, + .ingress = 1, + }; + /* Internal PMD action to set register. */ + struct mlx5_rte_flow_item_tag q_tag_spec = { + .data = qrss_id, + .id = 0, + }; + struct rte_flow_item q_items[] = { + { + .type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG, + .spec = &q_tag_spec, + .last = NULL, + .mask = NULL, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + struct rte_flow_action q_actions[] = { + { + .type = qrss->type, + .conf = qrss->conf, + }, + { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + uint64_t layers = flow_get_prefix_layer_flags(dev_flow); + + /* + * Configure the tag item only if there is no meter subflow. 
+	 * Since tag is already marked in the meter suffix subflow,
+	 * we can just use the meter suffix items as is.
+	 */
+	if (qrss_id) {
+		/* Not meter subflow. */
+		MLX5_ASSERT(!mtr_sfx);
+		/*
+		 * Put the unique id into the prefix flow: the prefix is
+		 * destroyed after the suffix flow, so the id is freed only
+		 * when no actual flows with this id remain and identifier
+		 * reallocation becomes possible (for example, for other
+		 * flows in other threads).
+		 */
+		dev_flow->handle->split_flow_id = qrss_id;
+		ret = mlx5_flow_get_reg_id(dev, MLX5_COPY_MARK, 0,
+					   error);
+		if (ret < 0)
+			goto exit;
+		q_tag_spec.id = ret;
+	}
+	dev_flow = NULL;
+	/* Add suffix subflow to execute Q/RSS. */
+	ret = flow_create_split_inner(dev, flow, &dev_flow, layers,
+				      &q_attr, mtr_sfx ? items :
+				      q_items, q_actions,
+				      external, flow_idx, error);
+	if (ret < 0)
+		goto exit;
+	/* qrss ID should be freed if failed. */
+	qrss_id = 0;
+	MLX5_ASSERT(dev_flow);
+	}
+
+exit:
+	/*
+	 * We do not destroy the partially created sub_flows in case of error.
+	 * These ones are included into parent flow list and will be destroyed
+	 * by flow_drv_destroy.
+	 */
+	flow_qrss_free_id(dev, qrss_id);
+	rte_free(ext_actions);
+	return ret;
+}
+
+/**
+ * The splitting for meter feature.
+ *
+ * - The meter flow will be split to two flows as prefix and
+ *   suffix flow. The packets reach the suffix flow only if they
+ *   pass the prefix meter action.
+ *
+ * - Reg_C_5 is used to match the packet between the prefix and
+ *   suffix flows.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[in] flow
+ *   Parent flow structure pointer.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[in] external
+ *   This flow rule is created by request external to PMD.
+ * @param[in] flow_idx
+ *   This memory pool index to the flow.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @return
+ *   0 on success, negative value otherwise
+ */
+static int
+flow_create_split_meter(struct rte_eth_dev *dev,
+			struct rte_flow *flow,
+			const struct rte_flow_attr *attr,
+			const struct rte_flow_item items[],
+			const struct rte_flow_action actions[],
+			bool external, uint32_t flow_idx,
+			struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct rte_flow_action *sfx_actions = NULL;
+	struct rte_flow_action *pre_actions = NULL;
+	struct rte_flow_item *sfx_items = NULL;
+	struct mlx5_flow *dev_flow = NULL;
+	struct rte_flow_attr sfx_attr = *attr;
+	uint32_t mtr = 0;
+	uint32_t mtr_tag_id = 0;
+	size_t act_size;
+	size_t item_size;
+	int actions_n = 0;
+	int ret;
+
+	if (priv->mtr_en)
+		actions_n = flow_check_meter_action(actions, &mtr);
+	if (mtr) {
+		/* The five prefix actions: meter, decap, encap, tag, end. */
+		act_size = sizeof(struct rte_flow_action) * (actions_n + 5) +
+			   sizeof(struct mlx5_rte_flow_action_set_tag);
+		/* tag, vlan, port id, end. */
+#define METER_SUFFIX_ITEM 4
+		item_size = sizeof(struct rte_flow_item) * METER_SUFFIX_ITEM +
+			    sizeof(struct mlx5_rte_flow_item_tag) * 2;
+		sfx_actions = rte_zmalloc(__func__, (act_size + item_size), 0);
+		if (!sfx_actions)
+			return rte_flow_error_set(error, ENOMEM,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  NULL, "no memory to split "
+						  "meter flow");
+		sfx_items = (struct rte_flow_item *)((char *)sfx_actions +
+			     act_size);
+		pre_actions = sfx_actions + actions_n;
+		mtr_tag_id = flow_meter_split_prep(dev, items, sfx_items,
+						   actions, sfx_actions,
+						   pre_actions);
+		if (!mtr_tag_id) {
+			ret = -rte_errno;
+			goto exit;
+		}
+		/* Add the prefix subflow. */
+		ret = flow_create_split_inner(dev, flow, &dev_flow, 0, attr,
+					      items, pre_actions, external,
+					      flow_idx, error);
+		if (ret) {
+			ret = -rte_errno;
+			goto exit;
+		}
+		dev_flow->handle->split_flow_id = mtr_tag_id;
+		/* Setting the suffix group attr. */
+		sfx_attr.group = sfx_attr.transfer ?
+				(MLX5_FLOW_TABLE_LEVEL_SUFFIX - 1) :
+				 MLX5_FLOW_TABLE_LEVEL_SUFFIX;
+	}
+	/* Add the suffix or the original (unsplit) flow. */
+	ret = flow_create_split_metadata(dev, flow, dev_flow ?
+					 flow_get_prefix_layer_flags(dev_flow) :
+					 0, &sfx_attr,
+					 sfx_items ? sfx_items : items,
+					 sfx_actions ? sfx_actions : actions,
+					 external, flow_idx, error);
+exit:
+	if (sfx_actions)
+		rte_free(sfx_actions);
+	return ret;
+}
+
+/**
+ * Split the flow to subflow set. The splitters might be linked
+ * in the chain, like this:
+ * flow_create_split_outer() calls:
+ *   flow_create_split_meter() calls:
+ *     flow_create_split_metadata(meter_subflow_0) calls:
+ *       flow_create_split_inner(metadata_subflow_0)
+ *       flow_create_split_inner(metadata_subflow_1)
+ *       flow_create_split_inner(metadata_subflow_2)
+ *     flow_create_split_metadata(meter_subflow_1) calls:
+ *       flow_create_split_inner(metadata_subflow_0)
+ *       flow_create_split_inner(metadata_subflow_1)
+ *       flow_create_split_inner(metadata_subflow_2)
+ *
+ * This provides a flexible way to add new levels of flow splitting.
+ * All successfully created subflows are included in the
+ * parent flow dev_flow list.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[in] flow
+ *   Parent flow structure pointer.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[in] external
+ *   This flow rule is created by request external to PMD.
+ * @param[in] flow_idx
+ *   This memory pool index to the flow.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @return
+ *   0 on success, negative value otherwise
+ */
+static int
+flow_create_split_outer(struct rte_eth_dev *dev,
+			struct rte_flow *flow,
+			const struct rte_flow_attr *attr,
+			const struct rte_flow_item items[],
+			const struct rte_flow_action actions[],
+			bool external, uint32_t flow_idx,
+			struct rte_flow_error *error)
+{
+	int ret;
+
+	ret = flow_create_split_meter(dev, flow, attr, items,
+				      actions, external, flow_idx, error);
+	MLX5_ASSERT(ret <= 0);
+	return ret;
+}
+
+/**
+ * Create a flow and add it to @p list.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param list
+ *   Pointer to a TAILQ flow list. If this parameter is NULL,
+ *   no list insertion occurs; the flow is just created and it is
+ *   the caller's responsibility to track the created flow.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A flow index on success, 0 otherwise and rte_errno is set. + */ +static uint32_t +flow_list_create(struct rte_eth_dev *dev, uint32_t *list, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow = NULL; + struct mlx5_flow *dev_flow; + const struct rte_flow_action_rss *rss; + union { + struct rte_flow_expand_rss buf; + uint8_t buffer[2048]; + } expand_buffer; + union { + struct rte_flow_action actions[MLX5_MAX_SPLIT_ACTIONS]; + uint8_t buffer[2048]; + } actions_rx; + union { + struct rte_flow_action actions[MLX5_MAX_SPLIT_ACTIONS]; + uint8_t buffer[2048]; + } actions_hairpin_tx; + union { + struct rte_flow_item items[MLX5_MAX_SPLIT_ITEMS]; + uint8_t buffer[2048]; + } items_tx; + struct rte_flow_expand_rss *buf = &expand_buffer.buf; + struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *) + priv->rss_desc)[!!priv->flow_idx]; + const struct rte_flow_action *p_actions_rx = actions; + uint32_t i; + uint32_t idx = 0; + int hairpin_flow; + uint32_t hairpin_id = 0; + struct rte_flow_attr attr_tx = { .priority = 0 }; + int ret; + + hairpin_flow = flow_check_hairpin_split(dev, attr, actions); + ret = flow_drv_validate(dev, attr, items, p_actions_rx, + external, hairpin_flow, error); + if (ret < 0) + return 0; + if (hairpin_flow > 0) { + if (hairpin_flow > MLX5_MAX_SPLIT_ACTIONS) { + rte_errno = EINVAL; + return 0; + } + flow_hairpin_split(dev, actions, actions_rx.actions, + actions_hairpin_tx.actions, items_tx.items, + &hairpin_id); + p_actions_rx = actions_rx.actions; + } + flow = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], &idx); + if (!flow) { + rte_errno = ENOMEM; + goto error_before_flow; + } + flow->drv_type = flow_get_drv_type(dev, attr); + if (hairpin_id != 0) + flow->hairpin_flow_id = hairpin_id; + MLX5_ASSERT(flow->drv_type > MLX5_FLOW_TYPE_MIN && + flow->drv_type < MLX5_FLOW_TYPE_MAX); + memset(rss_desc, 0, sizeof(*rss_desc)); + rss = flow_get_rss_action(p_actions_rx); + if (rss) { + /* + * The following information is required by + * mlx5_flow_hashfields_adjust() in advance. + */ + rss_desc->level = rss->level; + /* RSS type 0 indicates default RSS type (ETH_RSS_IP). */ + rss_desc->types = !rss->types ? ETH_RSS_IP : rss->types; + } + flow->dev_handles = 0; + if (rss && rss->types) { + unsigned int graph_root; + + graph_root = find_graph_root(items, rss->level); + ret = rte_flow_expand_rss(buf, sizeof(expand_buffer.buffer), + items, rss->types, + mlx5_support_expansion, + graph_root); + MLX5_ASSERT(ret > 0 && + (unsigned int)ret < sizeof(expand_buffer.buffer)); + } else { + buf->entries = 1; + buf->entry[0].pattern = (void *)(uintptr_t)items; + } + /* + * Record the start index when there is a nested call. All sub-flows + * need to be translated before another calling. + * No need to use ping-pong buffer to save memory here. + */ + if (priv->flow_idx) { + MLX5_ASSERT(!priv->flow_nested_idx); + priv->flow_nested_idx = priv->flow_idx; + } + for (i = 0; i < buf->entries; ++i) { + /* + * The splitter may create multiple dev_flows, + * depending on configuration. In the simplest + * case it just creates unmodified original flow. 
+ */ + ret = flow_create_split_outer(dev, flow, attr, + buf->entry[i].pattern, + p_actions_rx, external, idx, + error); + if (ret < 0) + goto error; + } + /* Create the tx flow. */ + if (hairpin_flow) { + attr_tx.group = MLX5_HAIRPIN_TX_TABLE; + attr_tx.ingress = 0; + attr_tx.egress = 1; + dev_flow = flow_drv_prepare(dev, flow, &attr_tx, items_tx.items, + actions_hairpin_tx.actions, + idx, error); + if (!dev_flow) + goto error; + dev_flow->flow = flow; + dev_flow->external = 0; + SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx, + dev_flow->handle, next); + ret = flow_drv_translate(dev, dev_flow, &attr_tx, + items_tx.items, + actions_hairpin_tx.actions, error); + if (ret < 0) + goto error; + } + /* + * Update the metadata register copy table. If extensive + * metadata feature is enabled and registers are supported + * we might create the extra rte_flow for each unique + * MARK/FLAG action ID. + * + * The table is updated for ingress Flows only, because + * the egress Flows belong to the different device and + * copy table should be updated in peer NIC Rx domain. + */ + if (attr->ingress && + (external || attr->group != MLX5_FLOW_MREG_CP_TABLE_GROUP)) { + ret = flow_mreg_update_copy_table(dev, flow, actions, error); + if (ret) + goto error; + } + /* + * If the flow is external (from application) OR device is started, then + * the flow will be applied immediately. + */ + if (external || dev->data->dev_started) { + ret = flow_drv_apply(dev, flow, error); + if (ret < 0) + goto error; + } + if (list) + ILIST_INSERT(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list, idx, + flow, next); + flow_rxq_flags_set(dev, flow); + /* Nested flow creation index recovery. */ + priv->flow_idx = priv->flow_nested_idx; + if (priv->flow_nested_idx) + priv->flow_nested_idx = 0; + return idx; +error: + MLX5_ASSERT(flow); + ret = rte_errno; /* Save rte_errno before cleanup. */ + flow_mreg_del_copy_action(dev, flow); + flow_drv_destroy(dev, flow); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], idx); + rte_errno = ret; /* Restore rte_errno. */ +error_before_flow: + ret = rte_errno; + if (hairpin_id) + mlx5_flow_id_release(priv->sh->flow_id_pool, + hairpin_id); + rte_errno = ret; + priv->flow_idx = priv->flow_nested_idx; + if (priv->flow_nested_idx) + priv->flow_nested_idx = 0; + return 0; +} + +/** + * Create a dedicated flow rule on e-switch table 0 (root table), to direct all + * incoming packets to table 1. + * + * Other flow rules, requested for group n, will be created in + * e-switch table n+1. + * Jump action to e-switch group n will be created to group n+1. + * + * Used when working in switchdev mode, to utilise advantages of table 1 + * and above. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Pointer to flow on success, NULL otherwise and rte_errno is set. 
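+ *
+ * For reference only: the rule built below corresponds roughly to the
+ * testpmd request
+ *   flow create <port> ingress transfer group 0 pattern end
+ *        actions jump group 1 / end
+ * except that it is kept on the internal control flow list instead of the
+ * application flow list.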
+ */
+struct rte_flow *
+mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
+{
+	const struct rte_flow_attr attr = {
+		.group = 0,
+		.priority = 0,
+		.ingress = 1,
+		.egress = 0,
+		.transfer = 1,
+	};
+	const struct rte_flow_item pattern = {
+		.type = RTE_FLOW_ITEM_TYPE_END,
+	};
+	struct rte_flow_action_jump jump = {
+		.group = 1,
+	};
+	const struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_JUMP,
+			.conf = &jump,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
+
+	return (void *)(uintptr_t)flow_list_create(dev, &priv->ctrl_flows,
+						   &attr, &pattern,
+						   actions, false, &error);
+}
+
+/**
+ * Validate a flow supported by the NIC.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_validate(struct rte_eth_dev *dev,
+		   const struct rte_flow_attr *attr,
+		   const struct rte_flow_item items[],
+		   const struct rte_flow_action actions[],
+		   struct rte_flow_error *error)
+{
+	int hairpin_flow;
+
+	hairpin_flow = flow_check_hairpin_split(dev, attr, actions);
+	return flow_drv_validate(dev, attr, items, actions,
+				 true, hairpin_flow, error);
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+struct rte_flow *
+mlx5_flow_create(struct rte_eth_dev *dev,
+		 const struct rte_flow_attr *attr,
+		 const struct rte_flow_item items[],
+		 const struct rte_flow_action actions[],
+		 struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	/*
+	 * If the device is not started yet, it is not allowed to create a
+	 * flow from the application. PMD default flows and traffic control
+	 * flows are not affected.
+	 */
+	if (unlikely(!dev->data->dev_started)) {
+		DRV_LOG(DEBUG, "port %u is not started when "
+			"inserting a flow", dev->data->port_id);
+		rte_flow_error_set(error, ENODEV,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "port not started");
+		return NULL;
+	}
+	return (void *)(uintptr_t)flow_list_create(dev, &priv->flows,
+				  attr, items, actions, true, error);
+}
+
+/**
+ * Destroy a flow in a list.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param list
+ *   Pointer to the indexed flow list. If this parameter is NULL,
+ *   the flow is not removed from any list. Note that, since flows
+ *   are kept in an indexed list, the memory a list entry points to
+ *   may change as flows are destroyed.
+ * @param[in] flow_idx
+ *   Index of flow to destroy.
+ */
+static void
+flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
+		  uint32_t flow_idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_fdir_flow *priv_fdir_flow = NULL;
+	struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool
+					       [MLX5_IPOOL_RTE_FLOW], flow_idx);
+
+	if (!flow)
+		return;
+	/*
+	 * Update RX queue flags only if port is started, otherwise it is
+	 * already clean.
+ */ + if (dev->data->dev_started) + flow_rxq_flags_trim(dev, flow); + if (flow->hairpin_flow_id) + mlx5_flow_id_release(priv->sh->flow_id_pool, + flow->hairpin_flow_id); + flow_drv_destroy(dev, flow); + if (list) + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list, + flow_idx, flow, next); + flow_mreg_del_copy_action(dev, flow); + if (flow->fdir) { + LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) { + if (priv_fdir_flow->rix_flow == flow_idx) + break; + } + if (priv_fdir_flow) { + LIST_REMOVE(priv_fdir_flow, next); + rte_free(priv_fdir_flow->fdir); + rte_free(priv_fdir_flow); + } + } + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx); +} + +/** + * Destroy all flows. + * + * @param dev + * Pointer to Ethernet device. + * @param list + * Pointer to the Indexed flow list. + * @param active + * If flushing is called avtively. + */ +void +mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active) +{ + uint32_t num_flushed = 0; + + while (*list) { + flow_list_destroy(dev, list, *list); + num_flushed++; + } + if (active) { + DRV_LOG(INFO, "port %u: %u flows flushed before stopping", + dev->data->port_id, num_flushed); + } +} + +/** + * Remove all flows. + * + * @param dev + * Pointer to Ethernet device. + * @param list + * Pointer to the Indexed flow list. + */ +void +mlx5_flow_stop(struct rte_eth_dev *dev, uint32_t *list) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow = NULL; + uint32_t idx; + + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], *list, idx, + flow, next) { + flow_drv_remove(dev, flow); + flow_mreg_stop_copy_action(dev, flow); + } + flow_mreg_del_default_copy_action(dev); + flow_rxq_flags_clear(dev); +} + +/** + * Add all flows. + * + * @param dev + * Pointer to Ethernet device. + * @param list + * Pointer to the Indexed flow list. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_start(struct rte_eth_dev *dev, uint32_t *list) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow = NULL; + struct rte_flow_error error; + uint32_t idx; + int ret = 0; + + /* Make sure default copy action (reg_c[0] -> reg_b) is created. */ + ret = flow_mreg_add_default_copy_action(dev, &error); + if (ret < 0) + return -rte_errno; + /* Apply Flows created by application. */ + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], *list, idx, + flow, next) { + ret = flow_mreg_start_copy_action(dev, flow); + if (ret < 0) + goto error; + ret = flow_drv_apply(dev, flow, &error); + if (ret < 0) + goto error; + flow_rxq_flags_set(dev, flow); + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_flow_stop(dev, list); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Stop all default actions for flows. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_flow_stop_default(struct rte_eth_dev *dev) +{ + flow_mreg_del_default_copy_action(dev); + flow_rxq_flags_clear(dev); +} + +/** + * Start all default actions for flows. + * + * @param dev + * Pointer to Ethernet device. + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_start_default(struct rte_eth_dev *dev) +{ + struct rte_flow_error error; + + /* Make sure default copy action (reg_c[0] -> reg_b) is created. */ + return flow_mreg_add_default_copy_action(dev, &error); +} + +/** + * Allocate intermediate resources for flow creation. 
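+ *
+ * A note on the layout, derived from the allocation below: inter_flows
+ * holds MLX5_NUM_MAX_DEV_FLOWS struct mlx5_flow entries followed by two
+ * ping-pong struct mlx5_flow_rss_desc buffers, each sized for a
+ * UINT16_MAX-entry queue list.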
+ * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_flow_alloc_intermediate(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!priv->inter_flows) { + priv->inter_flows = rte_calloc(__func__, 1, + MLX5_NUM_MAX_DEV_FLOWS * + sizeof(struct mlx5_flow) + + (sizeof(struct mlx5_flow_rss_desc) + + sizeof(uint16_t) * UINT16_MAX) * 2, 0); + if (!priv->inter_flows) { + DRV_LOG(ERR, "can't allocate intermediate memory."); + return; + } + } + priv->rss_desc = &((struct mlx5_flow *)priv->inter_flows) + [MLX5_NUM_MAX_DEV_FLOWS]; + /* Reset the index. */ + priv->flow_idx = 0; + priv->flow_nested_idx = 0; +} + +/** + * Free intermediate resources for flows. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_flow_free_intermediate(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + rte_free(priv->inter_flows); + priv->inter_flows = NULL; +} + +/** + * Verify the flow list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return the number of flows not released. + */ +int +mlx5_flow_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow; + uint32_t idx; + int ret = 0; + + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], priv->flows, idx, + flow, next) { + DRV_LOG(DEBUG, "port %u flow %p still referenced", + dev->data->port_id, (void *)flow); + ++ret; + } + return ret; +} + +/** + * Enable default hairpin egress flow. + * + * @param dev + * Pointer to Ethernet device. + * @param queue + * The queue index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev, + uint32_t queue) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_attr attr = { + .egress = 1, + .priority = 0, + }; + struct mlx5_rte_flow_item_tx_queue queue_spec = { + .queue = queue, + }; + struct mlx5_rte_flow_item_tx_queue queue_mask = { + .queue = UINT32_MAX, + }; + struct rte_flow_item items[] = { + { + .type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE, + .spec = &queue_spec, + .last = NULL, + .mask = &queue_mask, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + struct rte_flow_action_jump jump = { + .group = MLX5_HAIRPIN_TX_TABLE, + }; + struct rte_flow_action actions[2]; + uint32_t flow_idx; + struct rte_flow_error error; + + actions[0].type = RTE_FLOW_ACTION_TYPE_JUMP; + actions[0].conf = &jump; + actions[1].type = RTE_FLOW_ACTION_TYPE_END; + flow_idx = flow_list_create(dev, &priv->ctrl_flows, + &attr, items, actions, false, &error); + if (!flow_idx) { + DRV_LOG(DEBUG, + "Failed to create ctrl flow: rte_errno(%d)," + " type(%d), message(%s)", + rte_errno, error.type, + error.message ? error.message : " (no stated reason)"); + return -rte_errno; + } + return 0; +} + +/** + * Enable a control flow configured from the control plane. + * + * @param dev + * Pointer to Ethernet device. + * @param eth_spec + * An Ethernet flow spec to apply. + * @param eth_mask + * An Ethernet flow mask to apply. + * @param vlan_spec + * A VLAN flow spec to apply. + * @param vlan_mask + * A VLAN flow mask to apply. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
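+ *
+ * Minimal usage sketch (the spec/mask values are an example only):
+ *   struct rte_flow_item_eth bcast = {
+ *           .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ *   };
+ *   mlx5_ctrl_flow(dev, &bcast, &bcast);
+ * installs an RSS rule steering broadcast frames to all configured Rx
+ * queues; mlx5_ctrl_flow() below is this function without the VLAN item.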
+ */ +int +mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask, + struct rte_flow_item_vlan *vlan_spec, + struct rte_flow_item_vlan *vlan_mask) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_attr attr = { + .ingress = 1, + .priority = MLX5_FLOW_PRIO_RSVD, + }; + struct rte_flow_item items[] = { + { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = eth_spec, + .last = NULL, + .mask = eth_mask, + }, + { + .type = (vlan_spec) ? RTE_FLOW_ITEM_TYPE_VLAN : + RTE_FLOW_ITEM_TYPE_END, + .spec = vlan_spec, + .last = NULL, + .mask = vlan_mask, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + uint16_t queue[priv->reta_idx_n]; + struct rte_flow_action_rss action_rss = { + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = 0, + .types = priv->rss_conf.rss_hf, + .key_len = priv->rss_conf.rss_key_len, + .queue_num = priv->reta_idx_n, + .key = priv->rss_conf.rss_key, + .queue = queue, + }; + struct rte_flow_action actions[] = { + { + .type = RTE_FLOW_ACTION_TYPE_RSS, + .conf = &action_rss, + }, + { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + uint32_t flow_idx; + struct rte_flow_error error; + unsigned int i; + + if (!priv->reta_idx_n || !priv->rxqs_n) { + return 0; + } + if (!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)) + action_rss.types = 0; + for (i = 0; i != priv->reta_idx_n; ++i) + queue[i] = (*priv->reta_idx)[i]; + flow_idx = flow_list_create(dev, &priv->ctrl_flows, + &attr, items, actions, false, &error); + if (!flow_idx) + return -rte_errno; + return 0; +} + +/** + * Enable a flow control configured from the control plane. + * + * @param dev + * Pointer to Ethernet device. + * @param eth_spec + * An Ethernet flow spec to apply. + * @param eth_mask + * An Ethernet flow mask to apply. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_ctrl_flow(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask) +{ + return mlx5_ctrl_flow_vlan(dev, eth_spec, eth_mask, NULL, NULL); +} + +/** + * Destroy a flow. + * + * @see rte_flow_destroy() + * @see rte_flow_ops + */ +int +mlx5_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + flow_list_destroy(dev, &priv->flows, (uintptr_t)(void *)flow); + return 0; +} + +/** + * Destroy all flows. + * + * @see rte_flow_flush() + * @see rte_flow_ops + */ +int +mlx5_flow_flush(struct rte_eth_dev *dev, + struct rte_flow_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->flows, false); + return 0; +} + +/** + * Isolated mode. + * + * @see rte_flow_isolate() + * @see rte_flow_ops + */ +int +mlx5_flow_isolate(struct rte_eth_dev *dev, + int enable, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (dev->data->dev_started) { + rte_flow_error_set(error, EBUSY, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "port must be stopped first"); + return -rte_errno; + } + priv->isolated = !!enable; + if (enable) + dev->dev_ops = &mlx5_dev_ops_isolate; + else + dev->dev_ops = &mlx5_dev_ops; + return 0; +} + +/** + * Query a flow. 
+ * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_drv_query(struct rte_eth_dev *dev, + uint32_t flow_idx, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct mlx5_flow_driver_ops *fops; + struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool + [MLX5_IPOOL_RTE_FLOW], + flow_idx); + enum mlx5_flow_drv_type ftype; + + if (!flow) { + return rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "invalid flow handle"); + } + ftype = flow->drv_type; + MLX5_ASSERT(ftype > MLX5_FLOW_TYPE_MIN && ftype < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(ftype); + + return fops->query(dev, flow, actions, data, error); +} + +/** + * Query a flow. + * + * @see rte_flow_query() + * @see rte_flow_ops + */ +int +mlx5_flow_query(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + int ret; + + ret = flow_drv_query(dev, (uintptr_t)(void *)flow, actions, data, + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Convert a flow director filter to a generic flow. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Flow director filter to add. + * @param attributes + * Generic flow parameters structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_filter_convert(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter, + struct mlx5_fdir *attributes) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_eth_fdir_input *input = &fdir_filter->input; + const struct rte_eth_fdir_masks *mask = + &dev->data->dev_conf.fdir_conf.mask; + + /* Validate queue number. */ + if (fdir_filter->action.rx_queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u invalid queue number %d", + dev->data->port_id, fdir_filter->action.rx_queue); + rte_errno = EINVAL; + return -rte_errno; + } + attributes->attr.ingress = 1; + attributes->items[0] = (struct rte_flow_item) { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = &attributes->l2, + .mask = &attributes->l2_mask, + }; + switch (fdir_filter->action.behavior) { + case RTE_ETH_FDIR_ACCEPT: + attributes->actions[0] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_QUEUE, + .conf = &attributes->queue, + }; + break; + case RTE_ETH_FDIR_REJECT: + attributes->actions[0] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_DROP, + }; + break; + default: + DRV_LOG(ERR, "port %u invalid behavior %d", + dev->data->port_id, + fdir_filter->action.behavior); + rte_errno = ENOTSUP; + return -rte_errno; + } + attributes->queue.index = fdir_filter->action.rx_queue; + /* Handle L3. 
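+	 * Only non-fragmented IPv4/IPv6 flow types are converted; any other flow type is rejected with ENOTSUP.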
*/ + switch (fdir_filter->input.flow_type) { + case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: + attributes->l3.ipv4.hdr = (struct rte_ipv4_hdr){ + .src_addr = input->flow.ip4_flow.src_ip, + .dst_addr = input->flow.ip4_flow.dst_ip, + .time_to_live = input->flow.ip4_flow.ttl, + .type_of_service = input->flow.ip4_flow.tos, + }; + attributes->l3_mask.ipv4.hdr = (struct rte_ipv4_hdr){ + .src_addr = mask->ipv4_mask.src_ip, + .dst_addr = mask->ipv4_mask.dst_ip, + .time_to_live = mask->ipv4_mask.ttl, + .type_of_service = mask->ipv4_mask.tos, + .next_proto_id = mask->ipv4_mask.proto, + }; + attributes->items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .spec = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: + attributes->l3.ipv6.hdr = (struct rte_ipv6_hdr){ + .hop_limits = input->flow.ipv6_flow.hop_limits, + .proto = input->flow.ipv6_flow.proto, + }; + + memcpy(attributes->l3.ipv6.hdr.src_addr, + input->flow.ipv6_flow.src_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3.ipv6.hdr.dst_addr, + input->flow.ipv6_flow.dst_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.src_addr, + mask->ipv6_mask.src_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.dst_addr, + mask->ipv6_mask.dst_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + attributes->items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .spec = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + default: + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + /* Handle L4. 
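+	 * Copy the UDP/TCP port spec and mask; the OTHER flow types carry no L4 item.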
*/ + switch (fdir_filter->input.flow_type) { + case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + attributes->l4.udp.hdr = (struct rte_udp_hdr){ + .src_port = input->flow.udp4_flow.src_port, + .dst_port = input->flow.udp4_flow.dst_port, + }; + attributes->l4_mask.udp.hdr = (struct rte_udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_UDP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: + attributes->l4.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = input->flow.tcp4_flow.src_port, + .dst_port = input->flow.tcp4_flow.dst_port, + }; + attributes->l4_mask.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_TCP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: + attributes->l4.udp.hdr = (struct rte_udp_hdr){ + .src_port = input->flow.udp6_flow.src_port, + .dst_port = input->flow.udp6_flow.dst_port, + }; + attributes->l4_mask.udp.hdr = (struct rte_udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_UDP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: + attributes->l4.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = input->flow.tcp6_flow.src_port, + .dst_port = input->flow.tcp6_flow.dst_port, + }; + attributes->l4_mask.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_TCP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: + case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: + break; + default: + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + return 0; +} + +#define FLOW_FDIR_CMP(f1, f2, fld) \ + memcmp(&(f1)->fld, &(f2)->fld, sizeof(f1->fld)) + +/** + * Compare two FDIR flows. If items and actions are identical, the two flows are + * regarded as same. + * + * @param dev + * Pointer to Ethernet device. + * @param f1 + * FDIR flow to compare. + * @param f2 + * FDIR flow to compare. + * + * @return + * Zero on match, 1 otherwise. + */ +static int +flow_fdir_cmp(const struct mlx5_fdir *f1, const struct mlx5_fdir *f2) +{ + if (FLOW_FDIR_CMP(f1, f2, attr) || + FLOW_FDIR_CMP(f1, f2, l2) || + FLOW_FDIR_CMP(f1, f2, l2_mask) || + FLOW_FDIR_CMP(f1, f2, l3) || + FLOW_FDIR_CMP(f1, f2, l3_mask) || + FLOW_FDIR_CMP(f1, f2, l4) || + FLOW_FDIR_CMP(f1, f2, l4_mask) || + FLOW_FDIR_CMP(f1, f2, actions[0].type)) + return 1; + if (f1->actions[0].type == RTE_FLOW_ACTION_TYPE_QUEUE && + FLOW_FDIR_CMP(f1, f2, queue)) + return 1; + return 0; +} + +/** + * Search device flow list to find out a matched FDIR flow. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_flow + * FDIR flow to lookup. + * + * @return + * Index of flow if found, 0 otherwise. 
+ */ +static uint32_t +flow_fdir_filter_lookup(struct rte_eth_dev *dev, struct mlx5_fdir *fdir_flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t flow_idx = 0; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + + MLX5_ASSERT(fdir_flow); + LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) { + if (!flow_fdir_cmp(priv_fdir_flow->fdir, fdir_flow)) { + DRV_LOG(DEBUG, "port %u found FDIR flow %u", + dev->data->port_id, flow_idx); + flow_idx = priv_fdir_flow->rix_flow; + break; + } + } + return flow_idx; +} + +/** + * Add new flow director filter and store it in list. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Flow director filter to add. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_filter_add(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_fdir *fdir_flow; + struct rte_flow *flow; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + uint32_t flow_idx; + int ret; + + fdir_flow = rte_zmalloc(__func__, sizeof(*fdir_flow), 0); + if (!fdir_flow) { + rte_errno = ENOMEM; + return -rte_errno; + } + ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow); + if (ret) + goto error; + flow_idx = flow_fdir_filter_lookup(dev, fdir_flow); + if (flow_idx) { + rte_errno = EEXIST; + goto error; + } + priv_fdir_flow = rte_zmalloc(__func__, sizeof(struct mlx5_fdir_flow), + 0); + if (!priv_fdir_flow) { + rte_errno = ENOMEM; + goto error; + } + flow_idx = flow_list_create(dev, &priv->flows, &fdir_flow->attr, + fdir_flow->items, fdir_flow->actions, true, + NULL); + flow = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx); + if (!flow) + goto error; + flow->fdir = 1; + priv_fdir_flow->fdir = fdir_flow; + priv_fdir_flow->rix_flow = flow_idx; + LIST_INSERT_HEAD(&priv->fdir_flows, priv_fdir_flow, next); + DRV_LOG(DEBUG, "port %u created FDIR flow %p", + dev->data->port_id, (void *)flow); + return 0; +error: + rte_free(priv_fdir_flow); + rte_free(fdir_flow); + return -rte_errno; +} + +/** + * Delete specific filter. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Filter to be deleted. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_filter_delete(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t flow_idx; + struct mlx5_fdir fdir_flow = { + .attr.group = 0, + }; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + int ret; + + ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow); + if (ret) + return -rte_errno; + LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) { + /* Find the fdir in priv list */ + if (!flow_fdir_cmp(priv_fdir_flow->fdir, &fdir_flow)) + break; + } + if (!priv_fdir_flow) + return 0; + LIST_REMOVE(priv_fdir_flow, next); + flow_idx = priv_fdir_flow->rix_flow; + flow_list_destroy(dev, &priv->flows, flow_idx); + rte_free(priv_fdir_flow->fdir); + rte_free(priv_fdir_flow); + DRV_LOG(DEBUG, "port %u deleted FDIR flow %u", + dev->data->port_id, flow_idx); + return 0; +} + +/** + * Update queue for specific filter. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Filter to be updated. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_fdir_filter_update(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter) +{ + int ret; + + ret = flow_fdir_filter_delete(dev, fdir_filter); + if (ret) + return ret; + return flow_fdir_filter_add(dev, fdir_filter); +} + +/** + * Flush all filters. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +flow_fdir_filter_flush(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + + while (!LIST_EMPTY(&priv->fdir_flows)) { + priv_fdir_flow = LIST_FIRST(&priv->fdir_flows); + LIST_REMOVE(priv_fdir_flow, next); + flow_list_destroy(dev, &priv->flows, priv_fdir_flow->rix_flow); + rte_free(priv_fdir_flow->fdir); + rte_free(priv_fdir_flow); + } +} + +/** + * Get flow director information. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] fdir_info + * Resulting flow director information. + */ +static void +flow_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info) +{ + struct rte_eth_fdir_masks *mask = + &dev->data->dev_conf.fdir_conf.mask; + + fdir_info->mode = dev->data->dev_conf.fdir_conf.mode; + fdir_info->guarant_spc = 0; + rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask)); + fdir_info->max_flexpayload = 0; + fdir_info->flow_types_mask[0] = 0; + fdir_info->flex_payload_unit = 0; + fdir_info->max_flex_payload_segment_num = 0; + fdir_info->flex_payload_limit = 0; + memset(&fdir_info->flex_conf, 0, sizeof(fdir_info->flex_conf)); +} + +/** + * Deal with flow director operations. + * + * @param dev + * Pointer to Ethernet device. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op, + void *arg) +{ + enum rte_fdir_mode fdir_mode = + dev->data->dev_conf.fdir_conf.mode; + + if (filter_op == RTE_ETH_FILTER_NOP) + return 0; + if (fdir_mode != RTE_FDIR_MODE_PERFECT && + fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) { + DRV_LOG(ERR, "port %u flow director mode %d not supported", + dev->data->port_id, fdir_mode); + rte_errno = EINVAL; + return -rte_errno; + } + switch (filter_op) { + case RTE_ETH_FILTER_ADD: + return flow_fdir_filter_add(dev, arg); + case RTE_ETH_FILTER_UPDATE: + return flow_fdir_filter_update(dev, arg); + case RTE_ETH_FILTER_DELETE: + return flow_fdir_filter_delete(dev, arg); + case RTE_ETH_FILTER_FLUSH: + flow_fdir_filter_flush(dev); + break; + case RTE_ETH_FILTER_INFO: + flow_fdir_info_get(dev, arg); + break; + default: + DRV_LOG(DEBUG, "port %u unknown operation %u", + dev->data->port_id, filter_op); + rte_errno = EINVAL; + return -rte_errno; + } + return 0; +} + +/** + * Manage filter operations. + * + * @param dev + * Pointer to Ethernet device structure. + * @param filter_type + * Filter type. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +int +mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg) +{ + switch (filter_type) { + case RTE_ETH_FILTER_GENERIC: + if (filter_op != RTE_ETH_FILTER_GET) { + rte_errno = EINVAL; + return -rte_errno; + } + *(const void **)arg = &mlx5_flow_ops; + return 0; + case RTE_ETH_FILTER_FDIR: + return flow_fdir_ctrl_func(dev, filter_op, arg); + default: + DRV_LOG(ERR, "port %u filter type (%d) not supported", + dev->data->port_id, filter_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + return 0; +} + +/** + * Create the needed meter and suffix tables. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to the flow meter. + * + * @return + * Pointer to table set on success, NULL otherwise. + */ +struct mlx5_meter_domains_infos * +mlx5_flow_create_mtr_tbls(struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->create_mtr_tbls(dev, fm); +} + +/** + * Destroy the meter table set. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] tbl + * Pointer to the meter table set. + * + * @return + * 0 on success. + */ +int +mlx5_flow_destroy_mtr_tbls(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbls) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->destroy_mtr_tbls(dev, tbls); +} + +/** + * Create policer rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * 0 on success, -1 otherwise. + */ +int +mlx5_flow_create_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->create_policer_rules(dev, fm, attr); +} + +/** + * Destroy policer rules. + * + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * 0 on success, -1 otherwise. + */ +int +mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->destroy_policer_rules(dev, fm, attr); +} + +/** + * Allocate a counter. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * + * @return + * Index to allocated counter on success, 0 otherwise. + */ +uint32_t +mlx5_counter_alloc(struct rte_eth_dev *dev) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->counter_alloc(dev); + } + DRV_LOG(ERR, + "port %u counter allocate is not supported.", + dev->data->port_id); + return 0; +} + +/** + * Free a counter. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in] cnt + * Index to counter to be free. 
+ */ +void +mlx5_counter_free(struct rte_eth_dev *dev, uint32_t cnt) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + fops->counter_free(dev, cnt); + return; + } + DRV_LOG(ERR, + "port %u counter free is not supported.", + dev->data->port_id); +} + +/** + * Query counter statistics. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in] cnt + * Index to counter to query. + * @param[in] clear + * Set to clear counter statistics. + * @param[out] pkts + * The counter hits packets number to save. + * @param[out] bytes + * The counter hits bytes number to save. + * + * @return + * 0 on success, a negative errno value otherwise. + */ +int +mlx5_counter_query(struct rte_eth_dev *dev, uint32_t cnt, + bool clear, uint64_t *pkts, uint64_t *bytes) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->counter_query(dev, cnt, clear, pkts, bytes); + } + DRV_LOG(ERR, + "port %u counter query is not supported.", + dev->data->port_id); + return -ENOTSUP; +} + +#define MLX5_POOL_QUERY_FREQ_US 1000000 + +/** + * Get number of all validate pools. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + * + * @return + * The number of all validate pools. + */ +static uint32_t +mlx5_get_all_valid_pool_count(struct mlx5_ibv_shared *sh) +{ + int i; + uint32_t pools_n = 0; + + for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) + pools_n += rte_atomic16_read(&sh->cmng.ccont[i].n_valid); + return pools_n; +} + +/** + * Set the periodic procedure for triggering asynchronous batch queries for all + * the counter pools. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + */ +void +mlx5_set_query_alarm(struct mlx5_ibv_shared *sh) +{ + uint32_t pools_n, us; + + pools_n = mlx5_get_all_valid_pool_count(sh); + us = MLX5_POOL_QUERY_FREQ_US / pools_n; + DRV_LOG(DEBUG, "Set alarm for %u pools each %u us", pools_n, us); + if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) { + sh->cmng.query_thread_on = 0; + DRV_LOG(ERR, "Cannot reinitialize query alarm"); + } else { + sh->cmng.query_thread_on = 1; + } +} + +/** + * The periodic procedure for triggering asynchronous batch queries for all the + * counter pools. This function is probably called by the host thread. + * + * @param[in] arg + * The parameter for the alarm process. + */ +void +mlx5_flow_query_alarm(void *arg) +{ + struct mlx5_ibv_shared *sh = arg; + struct mlx5_devx_obj *dcs; + uint16_t offset; + int ret; + uint8_t batch = sh->cmng.batch; + uint8_t age = sh->cmng.age; + uint16_t pool_index = sh->cmng.pool_index; + struct mlx5_pools_container *cont; + struct mlx5_flow_counter_pool *pool; + int cont_loop = MLX5_CCONT_TYPE_MAX; + + if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES) + goto set_alarm; +next_container: + cont = MLX5_CNT_CONTAINER(sh, batch, age); + rte_spinlock_lock(&cont->resize_sl); + if (!cont->pools) { + rte_spinlock_unlock(&cont->resize_sl); + /* Check if all the containers are empty. 
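+		 * Once every container has been visited without finding pools, give up and re-arm the query alarm.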
*/ + if (unlikely(--cont_loop == 0)) + goto set_alarm; + batch ^= 0x1; + pool_index = 0; + if (batch == 0 && pool_index == 0) { + age ^= 0x1; + sh->cmng.batch = batch; + sh->cmng.age = age; + } + goto next_container; + } + pool = cont->pools[pool_index]; + rte_spinlock_unlock(&cont->resize_sl); + if (pool->raw_hw) + /* There is a pool query in progress. */ + goto set_alarm; + pool->raw_hw = + LIST_FIRST(&sh->cmng.free_stat_raws); + if (!pool->raw_hw) + /* No free counter statistics raw memory. */ + goto set_alarm; + dcs = (struct mlx5_devx_obj *)(uintptr_t)rte_atomic64_read + (&pool->a64_dcs); + offset = batch ? 0 : dcs->id % MLX5_COUNTERS_PER_POOL; + /* + * Identify the counters released between query trigger and query + * handle more effiecntly. The counter released in this gap period + * should wait for a new round of query as the new arrived packets + * will not be taken into account. + */ + rte_atomic64_add(&pool->start_query_gen, 1); + ret = mlx5_devx_cmd_flow_counter_query(dcs, 0, MLX5_COUNTERS_PER_POOL - + offset, NULL, NULL, + pool->raw_hw->mem_mng->dm->id, + (void *)(uintptr_t) + (pool->raw_hw->data + offset), + sh->devx_comp, + (uint64_t)(uintptr_t)pool); + if (ret) { + rte_atomic64_sub(&pool->start_query_gen, 1); + DRV_LOG(ERR, "Failed to trigger asynchronous query for dcs ID" + " %d", pool->min_dcs->id); + pool->raw_hw = NULL; + goto set_alarm; + } + pool->raw_hw->min_dcs_id = dcs->id; + LIST_REMOVE(pool->raw_hw, next); + sh->cmng.pending_queries++; + pool_index++; + if (pool_index >= rte_atomic16_read(&cont->n_valid)) { + batch ^= 0x1; + pool_index = 0; + if (batch == 0 && pool_index == 0) + age ^= 0x1; + } +set_alarm: + sh->cmng.batch = batch; + sh->cmng.pool_index = pool_index; + sh->cmng.age = age; + mlx5_set_query_alarm(sh); +} + +/** + * Check and callback event for new aged flow in the counter pool + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + * @param[in] pool + * Pointer to Current counter pool. + */ +static void +mlx5_flow_aging_check(struct mlx5_ibv_shared *sh, + struct mlx5_flow_counter_pool *pool) +{ + struct mlx5_priv *priv; + struct mlx5_flow_counter *cnt; + struct mlx5_age_info *age_info; + struct mlx5_age_param *age_param; + struct mlx5_counter_stats_raw *cur = pool->raw_hw; + struct mlx5_counter_stats_raw *prev = pool->raw; + uint16_t curr = rte_rdtsc() / (rte_get_tsc_hz() / 10); + uint32_t i; + + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt = MLX5_POOL_GET_CNT(pool, i); + age_param = MLX5_CNT_TO_AGE(cnt); + if (rte_atomic16_read(&age_param->state) != AGE_CANDIDATE) + continue; + if (cur->data[i].hits != prev->data[i].hits) { + age_param->expire = curr + age_param->timeout; + continue; + } + if ((uint16_t)(curr - age_param->expire) >= (UINT16_MAX / 2)) + continue; + /** + * Hold the lock first, or if between the + * state AGE_TMOUT and tailq operation the + * release happened, the release procedure + * may delete a non-existent tailq node. + */ + priv = rte_eth_devices[age_param->port_id].data->dev_private; + age_info = GET_PORT_AGE_INFO(priv); + rte_spinlock_lock(&age_info->aged_sl); + /* If the cpmset fails, release happens. 
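+		 * Only a successful AGE_CANDIDATE -> AGE_TMOUT transition queues the counter on the aged list.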
*/ + if (rte_atomic16_cmpset((volatile uint16_t *) + &age_param->state, + AGE_CANDIDATE, + AGE_TMOUT) == + AGE_CANDIDATE) { + TAILQ_INSERT_TAIL(&age_info->aged_counters, cnt, next); + MLX5_AGE_SET(age_info, MLX5_AGE_EVENT_NEW); + } + rte_spinlock_unlock(&age_info->aged_sl); + } + for (i = 0; i < sh->max_port; i++) { + age_info = &sh->port[i].age_info; + if (!MLX5_AGE_GET(age_info, MLX5_AGE_EVENT_NEW)) + continue; + if (MLX5_AGE_GET(age_info, MLX5_AGE_TRIGGER)) + _rte_eth_dev_callback_process + (&rte_eth_devices[sh->port[i].devx_ih_port_id], + RTE_ETH_EVENT_FLOW_AGED, NULL); + age_info->flags = 0; + } +} + +/** + * Handler for the HW respond about ready values from an asynchronous batch + * query. This function is probably called by the host thread. + * + * @param[in] sh + * The pointer to the shared IB device context. + * @param[in] async_id + * The Devx async ID. + * @param[in] status + * The status of the completion. + */ +void +mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh, + uint64_t async_id, int status) +{ + struct mlx5_flow_counter_pool *pool = + (struct mlx5_flow_counter_pool *)(uintptr_t)async_id; + struct mlx5_counter_stats_raw *raw_to_free; + + if (unlikely(status)) { + rte_atomic64_sub(&pool->start_query_gen, 1); + raw_to_free = pool->raw_hw; + } else { + raw_to_free = pool->raw; + if (IS_AGE_POOL(pool)) + mlx5_flow_aging_check(sh, pool); + rte_spinlock_lock(&pool->sl); + pool->raw = pool->raw_hw; + rte_spinlock_unlock(&pool->sl); + MLX5_ASSERT(rte_atomic64_read(&pool->end_query_gen) + 1 == + rte_atomic64_read(&pool->start_query_gen)); + rte_atomic64_set(&pool->end_query_gen, + rte_atomic64_read(&pool->start_query_gen)); + /* Be sure the new raw counters data is updated in memory. */ + rte_cio_wmb(); + } + LIST_INSERT_HEAD(&sh->cmng.free_stat_raws, raw_to_free, next); + pool->raw_hw = NULL; + sh->cmng.pending_queries--; +} + +/** + * Translate the rte_flow group index to HW table value. + * + * @param[in] attributes + * Pointer to flow attributes + * @param[in] external + * Value is part of flow rule created by request external to PMD. + * @param[in] group + * rte_flow group index value. + * @param[out] fdb_def_rule + * Whether fdb jump to table 1 is configured. + * @param[out] table + * HW table value. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, bool external, + uint32_t group, bool fdb_def_rule, uint32_t *table, + struct rte_flow_error *error) +{ + if (attributes->transfer && external && fdb_def_rule) { + if (group == UINT32_MAX) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, + "group index not supported"); + *table = group + 1; + } else { + *table = group; + } + return 0; +} + +/** + * Discover availability of metadata reg_c's. + * + * Iteratively use test flows to check availability. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_discover_mreg_c(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + enum modify_reg idx; + int n = 0; + + /* reg_c[0] and reg_c[1] are reserved. */ + config->flow_mreg_c[n++] = REG_C_0; + config->flow_mreg_c[n++] = REG_C_1; + /* Discover availability of other reg_c's. 
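+	 * A test flow that copies REG_C_1 into the candidate register is created; the register is reported available only if that flow can be created and applied.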
*/ + for (idx = REG_C_2; idx <= REG_C_7; ++idx) { + struct rte_flow_attr attr = { + .group = MLX5_FLOW_MREG_CP_TABLE_GROUP, + .priority = MLX5_FLOW_PRIO_RSVD, + .ingress = 1, + }; + struct rte_flow_item items[] = { + [0] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + struct rte_flow_action actions[] = { + [0] = { + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &(struct mlx5_flow_action_copy_mreg){ + .src = REG_C_1, + .dst = idx, + }, + }, + [1] = { + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = &(struct rte_flow_action_jump){ + .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP, + }, + }, + [2] = { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + uint32_t flow_idx; + struct rte_flow *flow; + struct rte_flow_error error; + + if (!config->dv_flow_en) + break; + /* Create internal flow, validation skips copy action. */ + flow_idx = flow_list_create(dev, NULL, &attr, items, + actions, false, &error); + flow = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + flow_idx); + if (!flow) + continue; + if (dev->data->dev_started || !flow_drv_apply(dev, flow, NULL)) + config->flow_mreg_c[n++] = idx; + flow_list_destroy(dev, NULL, flow_idx); + } + for (; n < MLX5_MREG_C_NUM; ++n) + config->flow_mreg_c[n] = REG_NONE; + return 0; +} + +/** + * Dump flow raw hw data to file + * + * @param[in] dev + * The pointer to Ethernet device. + * @param[in] file + * A pointer to a file for output. + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * @return + * 0 on success, a nagative value otherwise. + */ +int +mlx5_flow_dev_dump(struct rte_eth_dev *dev, + FILE *file, + struct rte_flow_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + return mlx5_devx_cmd_flow_dump(sh->fdb_domain, sh->rx_domain, + sh->tx_domain, file); +} + +/** + * Get aged-out flows. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] context + * The address of an array of pointers to the aged-out flows contexts. + * @param[in] nb_countexts + * The length of context array pointers. + * @param[out] error + * Perform verbose error reporting if not NULL. Initialized in case of + * error only. + * + * @return + * how many contexts get in success, otherwise negative errno value. + * if nb_contexts is 0, return the amount of all aged contexts. + * if nb_contexts is not 0 , return the amount of aged flows reported + * in the context array. 
+ */ +int +mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts, + uint32_t nb_contexts, struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->get_aged_flows(dev, contexts, nb_contexts, + error); + } + DRV_LOG(ERR, + "port %u get aged flows is not supported.", + dev->data->port_id); + return -ENOTSUP; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h new file mode 100644 index 000000000..2c9667756 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h @@ -0,0 +1,1034 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_FLOW_H_ +#define RTE_PMD_MLX5_FLOW_H_ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_atomic.h> +#include <rte_alarm.h> +#include <rte_mtr.h> + +#include <mlx5_prm.h> + +#include "mlx5.h" + +/* Private rte flow items. */ +enum mlx5_rte_flow_item_type { + MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN, + MLX5_RTE_FLOW_ITEM_TYPE_TAG, + MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE, + MLX5_RTE_FLOW_ITEM_TYPE_VLAN, +}; + +/* Private (internal) rte flow actions. */ +enum mlx5_rte_flow_action_type { + MLX5_RTE_FLOW_ACTION_TYPE_END = INT_MIN, + MLX5_RTE_FLOW_ACTION_TYPE_TAG, + MLX5_RTE_FLOW_ACTION_TYPE_MARK, + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, +}; + +/* Matches on selected register. */ +struct mlx5_rte_flow_item_tag { + enum modify_reg id; + uint32_t data; +}; + +/* Modify selected register. */ +struct mlx5_rte_flow_action_set_tag { + enum modify_reg id; + uint32_t data; +}; + +struct mlx5_flow_action_copy_mreg { + enum modify_reg dst; + enum modify_reg src; +}; + +/* Matches on source queue. */ +struct mlx5_rte_flow_item_tx_queue { + uint32_t queue; +}; + +/* Feature name to allocate metadata register. */ +enum mlx5_feature_name { + MLX5_HAIRPIN_RX, + MLX5_HAIRPIN_TX, + MLX5_METADATA_RX, + MLX5_METADATA_TX, + MLX5_METADATA_FDB, + MLX5_FLOW_MARK, + MLX5_APP_TAG, + MLX5_COPY_MARK, + MLX5_MTR_COLOR, + MLX5_MTR_SFX, +}; + +/* Pattern outer Layer bits. */ +#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2) +#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3) +#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4) +#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5) + +/* Pattern inner Layer bits. */ +#define MLX5_FLOW_LAYER_INNER_L2 (1u << 6) +#define MLX5_FLOW_LAYER_INNER_L3_IPV4 (1u << 7) +#define MLX5_FLOW_LAYER_INNER_L3_IPV6 (1u << 8) +#define MLX5_FLOW_LAYER_INNER_L4_UDP (1u << 9) +#define MLX5_FLOW_LAYER_INNER_L4_TCP (1u << 10) +#define MLX5_FLOW_LAYER_INNER_VLAN (1u << 11) + +/* Pattern tunnel Layer bits. */ +#define MLX5_FLOW_LAYER_VXLAN (1u << 12) +#define MLX5_FLOW_LAYER_VXLAN_GPE (1u << 13) +#define MLX5_FLOW_LAYER_GRE (1u << 14) +#define MLX5_FLOW_LAYER_MPLS (1u << 15) +/* List of tunnel Layer bits continued below. */ + +/* General pattern items bits. 
*/ +#define MLX5_FLOW_ITEM_METADATA (1u << 16) +#define MLX5_FLOW_ITEM_PORT_ID (1u << 17) +#define MLX5_FLOW_ITEM_TAG (1u << 18) +#define MLX5_FLOW_ITEM_MARK (1u << 19) + +/* Pattern MISC bits. */ +#define MLX5_FLOW_LAYER_ICMP (1u << 20) +#define MLX5_FLOW_LAYER_ICMP6 (1u << 21) +#define MLX5_FLOW_LAYER_GRE_KEY (1u << 22) + +/* Pattern tunnel Layer bits (continued). */ +#define MLX5_FLOW_LAYER_IPIP (1u << 23) +#define MLX5_FLOW_LAYER_IPV6_ENCAP (1u << 24) +#define MLX5_FLOW_LAYER_NVGRE (1u << 25) +#define MLX5_FLOW_LAYER_GENEVE (1u << 26) + +/* Queue items. */ +#define MLX5_FLOW_ITEM_TX_QUEUE (1u << 27) + +/* Pattern tunnel Layer bits (continued). */ +#define MLX5_FLOW_LAYER_GTP (1u << 28) + +/* Outer Masks. */ +#define MLX5_FLOW_LAYER_OUTER_L3 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6) +#define MLX5_FLOW_LAYER_OUTER_L4 \ + (MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP) +#define MLX5_FLOW_LAYER_OUTER \ + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3 | \ + MLX5_FLOW_LAYER_OUTER_L4) + +/* Tunnel Masks. */ +#define MLX5_FLOW_LAYER_TUNNEL \ + (MLX5_FLOW_LAYER_VXLAN | MLX5_FLOW_LAYER_VXLAN_GPE | \ + MLX5_FLOW_LAYER_GRE | MLX5_FLOW_LAYER_NVGRE | MLX5_FLOW_LAYER_MPLS | \ + MLX5_FLOW_LAYER_IPIP | MLX5_FLOW_LAYER_IPV6_ENCAP | \ + MLX5_FLOW_LAYER_GENEVE | MLX5_FLOW_LAYER_GTP) + +/* Inner Masks. */ +#define MLX5_FLOW_LAYER_INNER_L3 \ + (MLX5_FLOW_LAYER_INNER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV6) +#define MLX5_FLOW_LAYER_INNER_L4 \ + (MLX5_FLOW_LAYER_INNER_L4_UDP | MLX5_FLOW_LAYER_INNER_L4_TCP) +#define MLX5_FLOW_LAYER_INNER \ + (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3 | \ + MLX5_FLOW_LAYER_INNER_L4) + +/* Layer Masks. */ +#define MLX5_FLOW_LAYER_L2 \ + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_INNER_L2) +#define MLX5_FLOW_LAYER_L3_IPV4 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV4) +#define MLX5_FLOW_LAYER_L3_IPV6 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV6 | MLX5_FLOW_LAYER_INNER_L3_IPV6) +#define MLX5_FLOW_LAYER_L3 \ + (MLX5_FLOW_LAYER_L3_IPV4 | MLX5_FLOW_LAYER_L3_IPV6) +#define MLX5_FLOW_LAYER_L4 \ + (MLX5_FLOW_LAYER_OUTER_L4 | MLX5_FLOW_LAYER_INNER_L4) + +/* Actions */ +#define MLX5_FLOW_ACTION_DROP (1u << 0) +#define MLX5_FLOW_ACTION_QUEUE (1u << 1) +#define MLX5_FLOW_ACTION_RSS (1u << 2) +#define MLX5_FLOW_ACTION_FLAG (1u << 3) +#define MLX5_FLOW_ACTION_MARK (1u << 4) +#define MLX5_FLOW_ACTION_COUNT (1u << 5) +#define MLX5_FLOW_ACTION_PORT_ID (1u << 6) +#define MLX5_FLOW_ACTION_OF_POP_VLAN (1u << 7) +#define MLX5_FLOW_ACTION_OF_PUSH_VLAN (1u << 8) +#define MLX5_FLOW_ACTION_OF_SET_VLAN_VID (1u << 9) +#define MLX5_FLOW_ACTION_OF_SET_VLAN_PCP (1u << 10) +#define MLX5_FLOW_ACTION_SET_IPV4_SRC (1u << 11) +#define MLX5_FLOW_ACTION_SET_IPV4_DST (1u << 12) +#define MLX5_FLOW_ACTION_SET_IPV6_SRC (1u << 13) +#define MLX5_FLOW_ACTION_SET_IPV6_DST (1u << 14) +#define MLX5_FLOW_ACTION_SET_TP_SRC (1u << 15) +#define MLX5_FLOW_ACTION_SET_TP_DST (1u << 16) +#define MLX5_FLOW_ACTION_JUMP (1u << 17) +#define MLX5_FLOW_ACTION_SET_TTL (1u << 18) +#define MLX5_FLOW_ACTION_DEC_TTL (1u << 19) +#define MLX5_FLOW_ACTION_SET_MAC_SRC (1u << 20) +#define MLX5_FLOW_ACTION_SET_MAC_DST (1u << 21) +#define MLX5_FLOW_ACTION_ENCAP (1u << 22) +#define MLX5_FLOW_ACTION_DECAP (1u << 23) +#define MLX5_FLOW_ACTION_INC_TCP_SEQ (1u << 24) +#define MLX5_FLOW_ACTION_DEC_TCP_SEQ (1u << 25) +#define MLX5_FLOW_ACTION_INC_TCP_ACK (1u << 26) +#define MLX5_FLOW_ACTION_DEC_TCP_ACK (1u << 27) +#define MLX5_FLOW_ACTION_SET_TAG (1ull << 28) +#define 
MLX5_FLOW_ACTION_MARK_EXT (1ull << 29) +#define MLX5_FLOW_ACTION_SET_META (1ull << 30) +#define MLX5_FLOW_ACTION_METER (1ull << 31) +#define MLX5_FLOW_ACTION_SET_IPV4_DSCP (1ull << 32) +#define MLX5_FLOW_ACTION_SET_IPV6_DSCP (1ull << 33) +#define MLX5_FLOW_ACTION_AGE (1ull << 34) + +#define MLX5_FLOW_FATE_ACTIONS \ + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | \ + MLX5_FLOW_ACTION_RSS | MLX5_FLOW_ACTION_JUMP) + +#define MLX5_FLOW_FATE_ESWITCH_ACTIONS \ + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \ + MLX5_FLOW_ACTION_JUMP) + + +#define MLX5_FLOW_MODIFY_HDR_ACTIONS (MLX5_FLOW_ACTION_SET_IPV4_SRC | \ + MLX5_FLOW_ACTION_SET_IPV4_DST | \ + MLX5_FLOW_ACTION_SET_IPV6_SRC | \ + MLX5_FLOW_ACTION_SET_IPV6_DST | \ + MLX5_FLOW_ACTION_SET_TP_SRC | \ + MLX5_FLOW_ACTION_SET_TP_DST | \ + MLX5_FLOW_ACTION_SET_TTL | \ + MLX5_FLOW_ACTION_DEC_TTL | \ + MLX5_FLOW_ACTION_SET_MAC_SRC | \ + MLX5_FLOW_ACTION_SET_MAC_DST | \ + MLX5_FLOW_ACTION_INC_TCP_SEQ | \ + MLX5_FLOW_ACTION_DEC_TCP_SEQ | \ + MLX5_FLOW_ACTION_INC_TCP_ACK | \ + MLX5_FLOW_ACTION_DEC_TCP_ACK | \ + MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \ + MLX5_FLOW_ACTION_SET_TAG | \ + MLX5_FLOW_ACTION_MARK_EXT | \ + MLX5_FLOW_ACTION_SET_META | \ + MLX5_FLOW_ACTION_SET_IPV4_DSCP | \ + MLX5_FLOW_ACTION_SET_IPV6_DSCP) + +#define MLX5_FLOW_VLAN_ACTIONS (MLX5_FLOW_ACTION_OF_POP_VLAN | \ + MLX5_FLOW_ACTION_OF_PUSH_VLAN) + +#define MLX5_FLOW_XCAP_ACTIONS (MLX5_FLOW_ACTION_ENCAP | MLX5_FLOW_ACTION_DECAP) + +#ifndef IPPROTO_MPLS +#define IPPROTO_MPLS 137 +#endif + +/* UDP port number for MPLS */ +#define MLX5_UDP_PORT_MPLS 6635 + +/* UDP port numbers for VxLAN. */ +#define MLX5_UDP_PORT_VXLAN 4789 +#define MLX5_UDP_PORT_VXLAN_GPE 4790 + +/* UDP port numbers for GENEVE. */ +#define MLX5_UDP_PORT_GENEVE 6081 + +/* Priority reserved for default flows. */ +#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1) + +/* + * Number of sub priorities. + * For each kind of pattern matching i.e. L2, L3, L4 to have a correct + * matching on the NIC (firmware dependent) L4 most have the higher priority + * followed by L3 and ending with L2. + */ +#define MLX5_PRIORITY_MAP_L2 2 +#define MLX5_PRIORITY_MAP_L3 1 +#define MLX5_PRIORITY_MAP_L4 0 +#define MLX5_PRIORITY_MAP_MAX 3 + +/* Valid layer type for IPV4 RSS. */ +#define MLX5_IPV4_LAYER_TYPES \ + (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | \ + ETH_RSS_NONFRAG_IPV4_TCP | ETH_RSS_NONFRAG_IPV4_UDP | \ + ETH_RSS_NONFRAG_IPV4_OTHER) + +/* IBV hash source bits for IPV4. */ +#define MLX5_IPV4_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4) + +/* Valid layer type for IPV6 RSS. */ +#define MLX5_IPV6_LAYER_TYPES \ + (ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP | \ + ETH_RSS_NONFRAG_IPV6_UDP | ETH_RSS_IPV6_EX | ETH_RSS_IPV6_TCP_EX | \ + ETH_RSS_IPV6_UDP_EX | ETH_RSS_NONFRAG_IPV6_OTHER) + +/* IBV hash source bits for IPV6. */ +#define MLX5_IPV6_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6) + +/* IBV hash bits for L3 SRC. */ +#define MLX5_L3_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_SRC_IPV6) + +/* IBV hash bits for L3 DST. */ +#define MLX5_L3_DST_IBV_RX_HASH (IBV_RX_HASH_DST_IPV4 | IBV_RX_HASH_DST_IPV6) + +/* IBV hash bits for TCP. */ +#define MLX5_TCP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \ + IBV_RX_HASH_DST_PORT_TCP) + +/* IBV hash bits for UDP. */ +#define MLX5_UDP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_UDP | \ + IBV_RX_HASH_DST_PORT_UDP) + +/* IBV hash bits for L4 SRC. */ +#define MLX5_L4_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \ + IBV_RX_HASH_SRC_PORT_UDP) + +/* IBV hash bits for L4 DST. 
*/ +#define MLX5_L4_DST_IBV_RX_HASH (IBV_RX_HASH_DST_PORT_TCP | \ + IBV_RX_HASH_DST_PORT_UDP) + +/* Geneve header first 16Bit */ +#define MLX5_GENEVE_VER_MASK 0x3 +#define MLX5_GENEVE_VER_SHIFT 14 +#define MLX5_GENEVE_VER_VAL(a) \ + (((a) >> (MLX5_GENEVE_VER_SHIFT)) & (MLX5_GENEVE_VER_MASK)) +#define MLX5_GENEVE_OPTLEN_MASK 0x3F +#define MLX5_GENEVE_OPTLEN_SHIFT 7 +#define MLX5_GENEVE_OPTLEN_VAL(a) \ + (((a) >> (MLX5_GENEVE_OPTLEN_SHIFT)) & (MLX5_GENEVE_OPTLEN_MASK)) +#define MLX5_GENEVE_OAMF_MASK 0x1 +#define MLX5_GENEVE_OAMF_SHIFT 7 +#define MLX5_GENEVE_OAMF_VAL(a) \ + (((a) >> (MLX5_GENEVE_OAMF_SHIFT)) & (MLX5_GENEVE_OAMF_MASK)) +#define MLX5_GENEVE_CRITO_MASK 0x1 +#define MLX5_GENEVE_CRITO_SHIFT 6 +#define MLX5_GENEVE_CRITO_VAL(a) \ + (((a) >> (MLX5_GENEVE_CRITO_SHIFT)) & (MLX5_GENEVE_CRITO_MASK)) +#define MLX5_GENEVE_RSVD_MASK 0x3F +#define MLX5_GENEVE_RSVD_VAL(a) ((a) & (MLX5_GENEVE_RSVD_MASK)) +/* + * The length of the Geneve options fields, expressed in four byte multiples, + * not including the eight byte fixed tunnel. + */ +#define MLX5_GENEVE_OPT_LEN_0 14 +#define MLX5_GENEVE_OPT_LEN_1 63 + +#define MLX5_ENCAPSULATION_DECISION_SIZE (sizeof(struct rte_flow_item_eth) + \ + sizeof(struct rte_flow_item_ipv4)) + +/* Software header modify action numbers of a flow. */ +#define MLX5_ACT_NUM_MDF_IPV4 1 +#define MLX5_ACT_NUM_MDF_IPV6 4 +#define MLX5_ACT_NUM_MDF_MAC 2 +#define MLX5_ACT_NUM_MDF_VID 1 +#define MLX5_ACT_NUM_MDF_PORT 2 +#define MLX5_ACT_NUM_MDF_TTL 1 +#define MLX5_ACT_NUM_DEC_TTL MLX5_ACT_NUM_MDF_TTL +#define MLX5_ACT_NUM_MDF_TCPSEQ 1 +#define MLX5_ACT_NUM_MDF_TCPACK 1 +#define MLX5_ACT_NUM_SET_REG 1 +#define MLX5_ACT_NUM_SET_TAG 1 +#define MLX5_ACT_NUM_CPY_MREG MLX5_ACT_NUM_SET_TAG +#define MLX5_ACT_NUM_SET_MARK MLX5_ACT_NUM_SET_TAG +#define MLX5_ACT_NUM_SET_META MLX5_ACT_NUM_SET_TAG +#define MLX5_ACT_NUM_SET_DSCP 1 + +enum mlx5_flow_drv_type { + MLX5_FLOW_TYPE_MIN, + MLX5_FLOW_TYPE_DV, + MLX5_FLOW_TYPE_VERBS, + MLX5_FLOW_TYPE_MAX, +}; + +/* Fate action type. */ +enum mlx5_flow_fate_type { + MLX5_FLOW_FATE_NONE, /* Egress flow. */ + MLX5_FLOW_FATE_QUEUE, + MLX5_FLOW_FATE_JUMP, + MLX5_FLOW_FATE_PORT_ID, + MLX5_FLOW_FATE_DROP, + MLX5_FLOW_FATE_MAX, +}; + +/* Matcher PRM representation */ +struct mlx5_flow_dv_match_params { + size_t size; + /**< Size of match value. Do NOT split size and key! */ + uint32_t buf[MLX5_ST_SZ_DW(fte_match_param)]; + /**< Matcher value. This value is used as the mask or as a key. */ +}; + +/* Matcher structure. */ +struct mlx5_flow_dv_matcher { + LIST_ENTRY(mlx5_flow_dv_matcher) next; + /**< Pointer to the next element. */ + struct mlx5_flow_tbl_resource *tbl; + /**< Pointer to the table(group) the matcher associated with. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *matcher_object; /**< Pointer to DV matcher */ + uint16_t crc; /**< CRC of key. */ + uint16_t priority; /**< Priority of matcher. */ + struct mlx5_flow_dv_match_params mask; /**< Matcher mask. */ +}; + +#define MLX5_ENCAP_MAX_LEN 132 + +/* Encap/decap resource structure. */ +struct mlx5_flow_dv_encap_decap_resource { + ILIST_ENTRY(uint32_t)next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *verbs_action; + /**< Verbs encap/decap action object. */ + uint8_t buf[MLX5_ENCAP_MAX_LEN]; + size_t size; + uint8_t reformat_type; + uint8_t ft_type; + uint64_t flags; /**< Flags for RDMA API. */ +}; + +/* Tag resource structure. 
*/ +struct mlx5_flow_dv_tag_resource { + struct mlx5_hlist_entry entry; + /**< hash list entry for tag resource, tag value as the key. */ + void *action; + /**< Verbs tag action object. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + uint32_t idx; /**< Index for the index memory pool. */ +}; + +/* + * Number of modification commands. + * The maximal actions amount in FW is some constant, and it is 16 in the + * latest releases. In some old releases, it will be limited to 8. + * Since there is no interface to query the capacity, the maximal value should + * be used to allow PMD to create the flow. The validation will be done in the + * lower driver layer or FW. A failure will be returned if exceeds the maximal + * supported actions number on the root table. + * On non-root tables, there is no limitation, but 32 is enough right now. + */ +#define MLX5_MAX_MODIFY_NUM 32 +#define MLX5_ROOT_TBL_MODIFY_NUM 16 + +/* Modify resource structure */ +struct mlx5_flow_dv_modify_hdr_resource { + LIST_ENTRY(mlx5_flow_dv_modify_hdr_resource) next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + struct ibv_flow_action *verbs_action; + /**< Verbs modify header action object. */ + uint8_t ft_type; /**< Flow table type, Rx or Tx. */ + uint32_t actions_num; /**< Number of modification actions. */ + uint64_t flags; /**< Flags for RDMA API. */ + struct mlx5_modification_cmd actions[]; + /**< Modification actions. */ +}; + +/* Jump action resource structure. */ +struct mlx5_flow_dv_jump_tbl_resource { + rte_atomic32_t refcnt; /**< Reference counter. */ + uint8_t ft_type; /**< Flow table type, Rx or Tx. */ + void *action; /**< Pointer to the rdma core action. */ +}; + +/* Port ID resource structure. */ +struct mlx5_flow_dv_port_id_action_resource { + ILIST_ENTRY(uint32_t)next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *action; + /**< Verbs tag action object. */ + uint32_t port_id; /**< Port ID value. */ +}; + +/* Push VLAN action resource structure */ +struct mlx5_flow_dv_push_vlan_action_resource { + ILIST_ENTRY(uint32_t)next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *action; /**< Direct verbs action object. */ + uint8_t ft_type; /**< Flow table type, Rx, Tx or FDB. */ + rte_be32_t vlan_tag; /**< VLAN tag value. */ +}; + +/* Metadata register copy table entry. */ +struct mlx5_flow_mreg_copy_resource { + /* + * Hash list entry for copy table. + * - Key is 32/64-bit MARK action ID. + * - MUST be the first entry. + */ + struct mlx5_hlist_entry hlist_ent; + LIST_ENTRY(mlx5_flow_mreg_copy_resource) next; + /* List entry for device flows. */ + uint32_t refcnt; /* Reference counter. */ + uint32_t appcnt; /* Apply/Remove counter. */ + uint32_t idx; + uint32_t rix_flow; /* Built flow for copy. */ +}; + +/* Table data structure of the hash organization. */ +struct mlx5_flow_tbl_data_entry { + struct mlx5_hlist_entry entry; + /**< hash list entry, 64-bits key inside. */ + struct mlx5_flow_tbl_resource tbl; + /**< flow table resource. */ + LIST_HEAD(matchers, mlx5_flow_dv_matcher) matchers; + /**< matchers' header associated with the flow table. */ + struct mlx5_flow_dv_jump_tbl_resource jump; + /**< jump resource, at most one for each table created. */ + uint32_t idx; /**< index for the indexed mempool. */ +}; + +/* Verbs specification header. */ +struct ibv_spec_header { + enum ibv_flow_spec_type type; + uint16_t size; +}; + +/* RSS description. 
*/ +struct mlx5_flow_rss_desc { + uint32_t level; + uint32_t queue_num; /**< Number of entries in @p queue. */ + uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */ + uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */ + uint16_t queue[]; /**< Destination queues to redirect traffic to. */ +}; + + +/** Device flow handle structure for DV mode only. */ +struct mlx5_flow_handle_dv { + /* Flow DV api: */ + struct mlx5_flow_dv_matcher *matcher; /**< Cache to matcher. */ + struct mlx5_flow_dv_modify_hdr_resource *modify_hdr; + /**< Pointer to modify header resource in cache. */ + uint32_t rix_encap_decap; + /**< Index to encap/decap resource in cache. */ + uint32_t rix_push_vlan; + /**< Index to push VLAN action resource in cache. */ + uint32_t rix_tag; + /**< Index to the tag action. */ +} __rte_packed; + +/** Device flow handle structure: used both for creating & destroying. */ +struct mlx5_flow_handle { + SILIST_ENTRY(uint32_t)next; + struct mlx5_vf_vlan vf_vlan; /**< Structure for VF VLAN workaround. */ + /**< Index to next device flow handle. */ + uint64_t layers; + /**< Bit-fields of present layers, see MLX5_FLOW_LAYER_*. */ + void *ib_flow; /**< Verbs flow pointer. */ + uint32_t split_flow_id:28; /**< Sub flow unique match flow id. */ + uint32_t mark:1; /**< Metadate rxq mark flag. */ + uint32_t fate_action:3; /**< Fate action type. */ + union { + uint32_t rix_hrxq; /**< Hash Rx queue object index. */ + uint32_t rix_jump; /**< Index to the jump action resource. */ + uint32_t rix_port_id_action; + /**< Index to port ID action resource. */ + uint32_t rix_fate; + /**< Generic value indicates the fate action. */ + }; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5_flow_handle_dv dvh; +#endif +} __rte_packed; + +/* + * Size for Verbs device flow handle structure only. Do not use the DV only + * structure in Verbs. No DV flows attributes will be accessed. + * Macro offsetof() could also be used here. + */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT +#define MLX5_FLOW_HANDLE_VERBS_SIZE \ + (sizeof(struct mlx5_flow_handle) - sizeof(struct mlx5_flow_handle_dv)) +#else +#define MLX5_FLOW_HANDLE_VERBS_SIZE (sizeof(struct mlx5_flow_handle)) +#endif + +/* + * Max number of actions per DV flow. + * See CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED + * in rdma-core file providers/mlx5/verbs.c. + */ +#define MLX5_DV_MAX_NUMBER_OF_ACTIONS 8 + +/** Device flow structure only for DV flow creation. */ +struct mlx5_flow_dv_workspace { + uint32_t group; /**< The group index. */ + uint8_t transfer; /**< 1 if the flow is E-Switch flow. */ + int actions_n; /**< number of actions. */ + void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS]; /**< Action list. */ + struct mlx5_flow_dv_encap_decap_resource *encap_decap; + /**< Pointer to encap/decap resource in cache. */ + struct mlx5_flow_dv_push_vlan_action_resource *push_vlan_res; + /**< Pointer to push VLAN action resource in cache. */ + struct mlx5_flow_dv_tag_resource *tag_resource; + /**< pointer to the tag action. */ + struct mlx5_flow_dv_port_id_action_resource *port_id_action; + /**< Pointer to port ID action resource. */ + struct mlx5_flow_dv_jump_tbl_resource *jump; + /**< Pointer to the jump action resource. */ + struct mlx5_flow_dv_match_params value; + /**< Holds the value that the packet is compared to. */ +}; + +/* + * Maximal Verbs flow specifications & actions size. + * Some elements are mutually exclusive, but enough space should be allocated. + * Tunnel cases: 1. Max 2 Ethernet + IP(v6 len > v4 len) + TCP/UDP headers. + * 2. 
One tunnel header (exception: GRE + MPLS), + * SPEC length: GRE == tunnel. + * Actions: 1. 1 Mark OR Flag. + * 2. 1 Drop (if any). + * 3. No limitation for counters, but it makes no sense to support too + * many counters in a single device flow. + */ +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT +#define MLX5_VERBS_MAX_SPEC_SIZE \ + ( \ + (2 * (sizeof(struct ibv_flow_spec_eth) + \ + sizeof(struct ibv_flow_spec_ipv6) + \ + sizeof(struct ibv_flow_spec_tcp_udp)) + \ + sizeof(struct ibv_flow_spec_gre) + \ + sizeof(struct ibv_flow_spec_mpls)) \ + ) +#else +#define MLX5_VERBS_MAX_SPEC_SIZE \ + ( \ + (2 * (sizeof(struct ibv_flow_spec_eth) + \ + sizeof(struct ibv_flow_spec_ipv6) + \ + sizeof(struct ibv_flow_spec_tcp_udp)) + \ + sizeof(struct ibv_flow_spec_tunnel)) \ + ) +#endif + +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) +#define MLX5_VERBS_MAX_ACT_SIZE \ + ( \ + sizeof(struct ibv_flow_spec_action_tag) + \ + sizeof(struct ibv_flow_spec_action_drop) + \ + sizeof(struct ibv_flow_spec_counter_action) * 4 \ + ) +#else +#define MLX5_VERBS_MAX_ACT_SIZE \ + ( \ + sizeof(struct ibv_flow_spec_action_tag) + \ + sizeof(struct ibv_flow_spec_action_drop) \ + ) +#endif + +#define MLX5_VERBS_MAX_SPEC_ACT_SIZE \ + (MLX5_VERBS_MAX_SPEC_SIZE + MLX5_VERBS_MAX_ACT_SIZE) + +/** Device flow structure only for Verbs flow creation. */ +struct mlx5_flow_verbs_workspace { + unsigned int size; /**< Size of the attribute. */ + struct ibv_flow_attr attr; /**< Verbs flow attribute buffer. */ + uint8_t specs[MLX5_VERBS_MAX_SPEC_ACT_SIZE]; + /**< Specifications & actions buffer of verbs flow. */ +}; + +/** Maximal number of device sub-flows supported. */ +#define MLX5_NUM_MAX_DEV_FLOWS 32 + +/** Device flow structure. */ +struct mlx5_flow { + struct rte_flow *flow; /**< Pointer to the main flow. */ + uint32_t flow_idx; /**< The memory pool index to the main flow. */ + uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */ + uint64_t act_flags; + /**< Bit-fields of detected actions, see MLX5_FLOW_ACTION_*. */ + bool external; /**< true if the flow is created external to PMD. */ + uint8_t ingress; /**< 1 if the flow is ingress. */ + union { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5_flow_dv_workspace dv; +#endif + struct mlx5_flow_verbs_workspace verbs; + }; + struct mlx5_flow_handle *handle; + uint32_t handle_idx; /* Index of the mlx5 flow handle memory. */ +}; + +/* Flow meter state. */ +#define MLX5_FLOW_METER_DISABLE 0 +#define MLX5_FLOW_METER_ENABLE 1 + +#define MLX5_MAN_WIDTH 8 +/* Modify this value if enum rte_mtr_color changes. */ +#define RTE_MTR_DROPPED RTE_COLORS + +/* Meter policer statistics */ +struct mlx5_flow_policer_stats { + uint32_t cnt[RTE_COLORS + 1]; + /**< Color counter, extra for drop. */ + uint64_t stats_mask; + /**< Statistics mask for the colors. */ +}; + +/* Meter table structure. */ +struct mlx5_meter_domain_info { + struct mlx5_flow_tbl_resource *tbl; + /**< Meter table. */ + struct mlx5_flow_tbl_resource *sfx_tbl; + /**< Meter suffix table. */ + void *any_matcher; + /**< Meter color not match default criteria. */ + void *color_matcher; + /**< Meter color match criteria. */ + void *jump_actn; + /**< Meter match action. */ + void *policer_rules[RTE_MTR_DROPPED + 1]; + /**< Meter policer for the match. */ +}; + +/* Meter table set for TX RX FDB. */ +struct mlx5_meter_domains_infos { + uint32_t ref_cnt; + /**< Table user count. */ + struct mlx5_meter_domain_info egress; + /**< TX meter table. 
*/ + struct mlx5_meter_domain_info ingress; + /**< RX meter table. */ + struct mlx5_meter_domain_info transfer; + /**< FDB meter table. */ + void *drop_actn; + /**< Drop action as not matched. */ + void *count_actns[RTE_MTR_DROPPED + 1]; + /**< Counters for match and unmatched statistics. */ + uint32_t fmp[MLX5_ST_SZ_DW(flow_meter_parameters)]; + /**< Flow meter parameter. */ + size_t fmp_size; + /**< Flow meter parameter size. */ + void *meter_action; + /**< Flow meter action. */ +}; + +/* Meter parameter structure. */ +struct mlx5_flow_meter { + TAILQ_ENTRY(mlx5_flow_meter) next; + /**< Pointer to the next flow meter structure. */ + uint32_t idx; /* Index to meter object. */ + uint32_t meter_id; + /**< Meter id. */ + struct mlx5_flow_meter_profile *profile; + /**< Meter profile parameters. */ + + /** Policer actions (per meter output color). */ + enum rte_mtr_policer_action action[RTE_COLORS]; + + /** Set of stats counters to be enabled. + * @see enum rte_mtr_stats_type + */ + uint64_t stats_mask; + + /**< Rule applies to ingress traffic. */ + uint32_t ingress:1; + + /**< Rule applies to egress traffic. */ + uint32_t egress:1; + /** + * Instead of simply matching the properties of traffic as it would + * appear on a given DPDK port ID, enabling this attribute transfers + * a flow rule to the lowest possible level of any device endpoints + * found in the pattern. + * + * When supported, this effectively enables an application to + * re-route traffic not necessarily intended for it (e.g. coming + * from or addressed to different physical ports, VFs or + * applications) at the device level. + * + * It complements the behavior of some pattern items such as + * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them. + * + * When transferring flow rules, ingress and egress attributes keep + * their original meaning, as if processing traffic emitted or + * received by the application. + */ + uint32_t transfer:1; + struct mlx5_meter_domains_infos *mfts; + /**< Flow table created for this meter. */ + struct mlx5_flow_policer_stats policer_stats; + /**< Meter policer statistics. */ + uint32_t ref_cnt; + /**< Use count. */ + uint32_t active_state:1; + /**< Meter state. */ + uint32_t shared:1; + /**< Meter shared or not. */ +}; + +/* RFC2697 parameter structure. */ +struct mlx5_flow_meter_srtcm_rfc2697_prm { + /* green_saturation_value = cbs_mantissa * 2^cbs_exponent */ + uint32_t cbs_exponent:5; + uint32_t cbs_mantissa:8; + /* cir = 8G * cir_mantissa * 1/(2^cir_exponent) Bytes/Sec */ + uint32_t cir_exponent:5; + uint32_t cir_mantissa:8; + /* yellow _saturation_value = ebs_mantissa * 2^ebs_exponent */ + uint32_t ebs_exponent:5; + uint32_t ebs_mantissa:8; +}; + +/* Flow meter profile structure. */ +struct mlx5_flow_meter_profile { + TAILQ_ENTRY(mlx5_flow_meter_profile) next; + /**< Pointer to the next flow meter structure. */ + uint32_t meter_profile_id; /**< Profile id. */ + struct rte_mtr_meter_profile profile; /**< Profile detail. */ + union { + struct mlx5_flow_meter_srtcm_rfc2697_prm srtcm_prm; + /**< srtcm_rfc2697 struct. */ + }; + uint32_t ref_cnt; /**< Use count. */ +}; + +/* Fdir flow structure */ +struct mlx5_fdir_flow { + LIST_ENTRY(mlx5_fdir_flow) next; /* Pointer to the next element. */ + struct mlx5_fdir *fdir; /* Pointer to fdir. */ + uint32_t rix_flow; /* Index to flow. */ +}; + +#define HAIRPIN_FLOW_ID_BITS 28 + +/* Flow structure. */ +struct rte_flow { + ILIST_ENTRY(uint32_t)next; /**< Index to the next flow structure. 
*/ + uint32_t dev_handles; + /**< Device flow handles that are part of the flow. */ + uint32_t drv_type:2; /**< Driver type. */ + uint32_t fdir:1; /**< Identifier of associated FDIR if any. */ + uint32_t hairpin_flow_id:HAIRPIN_FLOW_ID_BITS; + /**< The flow id used for hairpin. */ + uint32_t copy_applied:1; /**< The MARK copy Flow os applied. */ + uint32_t rix_mreg_copy; + /**< Index to metadata register copy table resource. */ + uint32_t counter; /**< Holds flow counter. */ + uint16_t meter; /**< Holds flow meter id. */ +} __rte_packed; + +typedef int (*mlx5_flow_validate_t)(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, + int hairpin, + struct rte_flow_error *error); +typedef struct mlx5_flow *(*mlx5_flow_prepare_t) + (struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], struct rte_flow_error *error); +typedef int (*mlx5_flow_translate_t)(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +typedef int (*mlx5_flow_apply_t)(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error); +typedef void (*mlx5_flow_remove_t)(struct rte_eth_dev *dev, + struct rte_flow *flow); +typedef void (*mlx5_flow_destroy_t)(struct rte_eth_dev *dev, + struct rte_flow *flow); +typedef int (*mlx5_flow_query_t)(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error); +typedef struct mlx5_meter_domains_infos *(*mlx5_flow_create_mtr_tbls_t) + (struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm); +typedef int (*mlx5_flow_destroy_mtr_tbls_t)(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbls); +typedef int (*mlx5_flow_create_policer_rules_t) + (struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +typedef int (*mlx5_flow_destroy_policer_rules_t) + (struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +typedef uint32_t (*mlx5_flow_counter_alloc_t) + (struct rte_eth_dev *dev); +typedef void (*mlx5_flow_counter_free_t)(struct rte_eth_dev *dev, + uint32_t cnt); +typedef int (*mlx5_flow_counter_query_t)(struct rte_eth_dev *dev, + uint32_t cnt, + bool clear, uint64_t *pkts, + uint64_t *bytes); +typedef int (*mlx5_flow_get_aged_flows_t) + (struct rte_eth_dev *dev, + void **context, + uint32_t nb_contexts, + struct rte_flow_error *error); +struct mlx5_flow_driver_ops { + mlx5_flow_validate_t validate; + mlx5_flow_prepare_t prepare; + mlx5_flow_translate_t translate; + mlx5_flow_apply_t apply; + mlx5_flow_remove_t remove; + mlx5_flow_destroy_t destroy; + mlx5_flow_query_t query; + mlx5_flow_create_mtr_tbls_t create_mtr_tbls; + mlx5_flow_destroy_mtr_tbls_t destroy_mtr_tbls; + mlx5_flow_create_policer_rules_t create_policer_rules; + mlx5_flow_destroy_policer_rules_t destroy_policer_rules; + mlx5_flow_counter_alloc_t counter_alloc; + mlx5_flow_counter_free_t counter_free; + mlx5_flow_counter_query_t counter_query; + mlx5_flow_get_aged_flows_t get_aged_flows; +}; + +/* mlx5_flow.c */ + +struct mlx5_flow_id_pool *mlx5_flow_id_pool_alloc(uint32_t max_id); +void mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool); +uint32_t mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, 
uint32_t *id); +uint32_t mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, + uint32_t id); +int mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, + bool external, uint32_t group, bool fdb_def_rule, + uint32_t *table, struct rte_flow_error *error); +uint64_t mlx5_flow_hashfields_adjust(struct mlx5_flow_rss_desc *rss_desc, + int tunnel, uint64_t layer_types, + uint64_t hash_fields); +uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority, + uint32_t subpriority); +int mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + enum mlx5_feature_name feature, + uint32_t id, + struct rte_flow_error *error); +const struct rte_flow_action *mlx5_flow_find_action + (const struct rte_flow_action *actions, + enum rte_flow_action_type action); +int mlx5_flow_validate_action_count(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_drop(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_flag(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_queue(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_rss(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow_error *error); +int mlx5_flow_item_acceptable(const struct rte_flow_item *item, + const uint8_t *mask, + const uint8_t *nic_mask, + unsigned int size, + struct rte_flow_error *error); +int mlx5_flow_validate_item_eth(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_gre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_gre_key(const struct rte_flow_item *item, + uint64_t item_flags, + const struct rte_flow_item *gre_item, + struct rte_flow_error *error); +int mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv4 *acc_mask, + struct rte_flow_error *error); +int mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv6 *acc_mask, + struct rte_flow_error *error); +int mlx5_flow_validate_item_mpls(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t prev_layer, + struct rte_flow_error *error); +int mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + const struct rte_flow_item_tcp *flow_mask, + struct rte_flow_error *error); +int mlx5_flow_validate_item_udp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + 
struct rte_flow_error *error); +int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error); +int mlx5_flow_validate_item_icmp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_icmp6(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_nvgre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_geneve(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error); +struct mlx5_meter_domains_infos *mlx5_flow_create_mtr_tbls + (struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm); +int mlx5_flow_destroy_mtr_tbls(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbl); +int mlx5_flow_create_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +int mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +int mlx5_flow_meter_flush(struct rte_eth_dev *dev, + struct rte_mtr_error *error); +#endif /* RTE_PMD_MLX5_FLOW_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c new file mode 100644 index 000000000..e48183195 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c @@ -0,0 +1,9666 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_cycles.h> +#include <rte_ip.h> +#include <rte_gre.h> +#include <rte_vxlan.h> +#include <rte_gtp.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_flow.h" +#include "mlx5_rxtx.h" + +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + +#ifndef HAVE_IBV_FLOW_DEVX_COUNTERS +#define MLX5DV_FLOW_ACTION_COUNTERS_DEVX 0 +#endif + +#ifndef HAVE_MLX5DV_DR_ESWITCH +#ifndef MLX5DV_FLOW_TABLE_TYPE_FDB +#define MLX5DV_FLOW_TABLE_TYPE_FDB 0 +#endif +#endif + +#ifndef HAVE_MLX5DV_DR +#define MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL 1 +#endif + +/* VLAN header definitions */ +#define MLX5DV_FLOW_VLAN_PCP_SHIFT 13 +#define MLX5DV_FLOW_VLAN_PCP_MASK (0x7 << MLX5DV_FLOW_VLAN_PCP_SHIFT) +#define MLX5DV_FLOW_VLAN_VID_MASK 0x0fff +#define MLX5DV_FLOW_VLAN_PCP_MASK_BE RTE_BE16(MLX5DV_FLOW_VLAN_PCP_MASK) +#define MLX5DV_FLOW_VLAN_VID_MASK_BE RTE_BE16(MLX5DV_FLOW_VLAN_VID_MASK) + +union flow_dv_attr { + struct { + uint32_t valid:1; + uint32_t ipv4:1; + uint32_t ipv6:1; + uint32_t tcp:1; + uint32_t udp:1; + uint32_t reserved:27; + }; + uint32_t attr; +}; + +static int +flow_dv_tbl_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_tbl_resource *tbl); + +/** + * Initialize flow attributes structure according to flow items' types. + * + * flow_dv_validate() avoids multiple L3/L4 layers cases other than tunnel + * mode. For tunnel mode, the items to be modified are the outermost ones. + * + * @param[in] item + * Pointer to item specification. + * @param[out] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + */ +static void +flow_dv_attr_init(const struct rte_flow_item *item, union flow_dv_attr *attr, + struct mlx5_flow *dev_flow, bool tunnel_decap) +{ + uint64_t layers = dev_flow->handle->layers; + + /* + * If layers is already initialized, it means this dev_flow is the + * suffix flow, the layers flags is set by the prefix flow. Need to + * use the layer flags from prefix flow as the suffix flow may not + * have the user defined items as the flow is split. 
+ */ + if (layers) { + if (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4) + attr->ipv4 = 1; + else if (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6) + attr->ipv6 = 1; + if (layers & MLX5_FLOW_LAYER_OUTER_L4_TCP) + attr->tcp = 1; + else if (layers & MLX5_FLOW_LAYER_OUTER_L4_UDP) + attr->udp = 1; + attr->valid = 1; + return; + } + for (; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + uint8_t next_protocol = 0xff; + switch (item->type) { + case RTE_FLOW_ITEM_TYPE_GRE: + case RTE_FLOW_ITEM_TYPE_NVGRE: + case RTE_FLOW_ITEM_TYPE_VXLAN: + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + case RTE_FLOW_ITEM_TYPE_GENEVE: + case RTE_FLOW_ITEM_TYPE_MPLS: + if (tunnel_decap) + attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + if (!attr->ipv6) + attr->ipv4 = 1; + if (item->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + item->mask)->hdr.next_proto_id) + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (item->spec))->hdr.next_proto_id & + ((const struct rte_flow_item_ipv4 *) + (item->mask))->hdr.next_proto_id; + if ((next_protocol == IPPROTO_IPIP || + next_protocol == IPPROTO_IPV6) && tunnel_decap) + attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + if (!attr->ipv4) + attr->ipv6 = 1; + if (item->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + item->mask)->hdr.proto) + next_protocol = + ((const struct rte_flow_item_ipv6 *) + (item->spec))->hdr.proto & + ((const struct rte_flow_item_ipv6 *) + (item->mask))->hdr.proto; + if ((next_protocol == IPPROTO_IPIP || + next_protocol == IPPROTO_IPV6) && tunnel_decap) + attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + if (!attr->tcp) + attr->udp = 1; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + if (!attr->udp) + attr->tcp = 1; + break; + default: + break; + } + } + attr->valid = 1; +} + +/** + * Convert rte_mtr_color to mlx5 color. + * + * @param[in] rcol + * rte_mtr_color. + * + * @return + * mlx5 color. + */ +static int +rte_col_2_mlx5_col(enum rte_color rcol) +{ + switch (rcol) { + case RTE_COLOR_GREEN: + return MLX5_FLOW_COLOR_GREEN; + case RTE_COLOR_YELLOW: + return MLX5_FLOW_COLOR_YELLOW; + case RTE_COLOR_RED: + return MLX5_FLOW_COLOR_RED; + default: + break; + } + return MLX5_FLOW_COLOR_UNDEFINED; +} + +struct field_modify_info { + uint32_t size; /* Size of field in protocol header, in bytes. */ + uint32_t offset; /* Offset of field in protocol header, in bytes. */ + enum mlx5_modification_field id; +}; + +struct field_modify_info modify_eth[] = { + {4, 0, MLX5_MODI_OUT_DMAC_47_16}, + {2, 4, MLX5_MODI_OUT_DMAC_15_0}, + {4, 6, MLX5_MODI_OUT_SMAC_47_16}, + {2, 10, MLX5_MODI_OUT_SMAC_15_0}, + {0, 0, 0}, +}; + +struct field_modify_info modify_vlan_out_first_vid[] = { + /* Size in bits !!! 
*/ + {12, 0, MLX5_MODI_OUT_FIRST_VID}, + {0, 0, 0}, +}; + +struct field_modify_info modify_ipv4[] = { + {1, 1, MLX5_MODI_OUT_IP_DSCP}, + {1, 8, MLX5_MODI_OUT_IPV4_TTL}, + {4, 12, MLX5_MODI_OUT_SIPV4}, + {4, 16, MLX5_MODI_OUT_DIPV4}, + {0, 0, 0}, +}; + +struct field_modify_info modify_ipv6[] = { + {1, 0, MLX5_MODI_OUT_IP_DSCP}, + {1, 7, MLX5_MODI_OUT_IPV6_HOPLIMIT}, + {4, 8, MLX5_MODI_OUT_SIPV6_127_96}, + {4, 12, MLX5_MODI_OUT_SIPV6_95_64}, + {4, 16, MLX5_MODI_OUT_SIPV6_63_32}, + {4, 20, MLX5_MODI_OUT_SIPV6_31_0}, + {4, 24, MLX5_MODI_OUT_DIPV6_127_96}, + {4, 28, MLX5_MODI_OUT_DIPV6_95_64}, + {4, 32, MLX5_MODI_OUT_DIPV6_63_32}, + {4, 36, MLX5_MODI_OUT_DIPV6_31_0}, + {0, 0, 0}, +}; + +struct field_modify_info modify_udp[] = { + {2, 0, MLX5_MODI_OUT_UDP_SPORT}, + {2, 2, MLX5_MODI_OUT_UDP_DPORT}, + {0, 0, 0}, +}; + +struct field_modify_info modify_tcp[] = { + {2, 0, MLX5_MODI_OUT_TCP_SPORT}, + {2, 2, MLX5_MODI_OUT_TCP_DPORT}, + {4, 4, MLX5_MODI_OUT_TCP_SEQ_NUM}, + {4, 8, MLX5_MODI_OUT_TCP_ACK_NUM}, + {0, 0, 0}, +}; + +static void +mlx5_flow_tunnel_ip_check(const struct rte_flow_item *item __rte_unused, + uint8_t next_protocol, uint64_t *item_flags, + int *tunnel) +{ + MLX5_ASSERT(item->type == RTE_FLOW_ITEM_TYPE_IPV4 || + item->type == RTE_FLOW_ITEM_TYPE_IPV6); + if (next_protocol == IPPROTO_IPIP) { + *item_flags |= MLX5_FLOW_LAYER_IPIP; + *tunnel = 1; + } + if (next_protocol == IPPROTO_IPV6) { + *item_flags |= MLX5_FLOW_LAYER_IPV6_ENCAP; + *tunnel = 1; + } +} + +/** + * Acquire the synchronizing object to protect multithreaded access + * to shared dv context. Lock occurs only if context is actually + * shared, i.e. we have multiport IB device and representors are + * created. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + */ +static void +flow_dv_shared_lock(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (sh->dv_refcnt > 1) { + int ret; + + ret = pthread_mutex_lock(&sh->dv_mutex); + MLX5_ASSERT(!ret); + (void)ret; + } +} + +static void +flow_dv_shared_unlock(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (sh->dv_refcnt > 1) { + int ret; + + ret = pthread_mutex_unlock(&sh->dv_mutex); + MLX5_ASSERT(!ret); + (void)ret; + } +} + +/* Update VLAN's VID/PCP based on input rte_flow_action. + * + * @param[in] action + * Pointer to struct rte_flow_action. + * @param[out] vlan + * Pointer to struct rte_vlan_hdr. + */ +static void +mlx5_update_vlan_vid_pcp(const struct rte_flow_action *action, + struct rte_vlan_hdr *vlan) +{ + uint16_t vlan_tci; + if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) { + vlan_tci = + ((const struct rte_flow_action_of_set_vlan_pcp *) + action->conf)->vlan_pcp; + vlan_tci = vlan_tci << MLX5DV_FLOW_VLAN_PCP_SHIFT; + vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK; + vlan->vlan_tci |= vlan_tci; + } else if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) { + vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_VID_MASK; + vlan->vlan_tci |= rte_be_to_cpu_16 + (((const struct rte_flow_action_of_set_vlan_vid *) + action->conf)->vlan_vid); + } +} + +/** + * Fetch 1, 2, 3 or 4 byte field from the byte array + * and return as unsigned integer in host-endian format. + * + * @param[in] data + * Pointer to data array. + * @param[in] size + * Size of field to extract. + * + * @return + * converted field in host endian format. 
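The TCI handling in mlx5_update_vlan_vid_pcp() above packs the 3-bit PCP field at bit 13 and the 12-bit VID in the low bits of the same 16-bit tag control word. Below is a minimal standalone sketch of that masking, with the MLX5DV_FLOW_VLAN_* constants re-declared locally; it is an illustration only, not driver code.

#include <stdint.h>
#include <stdio.h>

#define VLAN_PCP_SHIFT 13
#define VLAN_PCP_MASK  (0x7 << VLAN_PCP_SHIFT)
#define VLAN_VID_MASK  0x0fff

static uint16_t set_pcp(uint16_t tci, uint16_t pcp)
{
	/* Clear the 3 PCP bits, then insert the new priority. */
	tci &= ~VLAN_PCP_MASK;
	return tci | (uint16_t)(pcp << VLAN_PCP_SHIFT);
}

static uint16_t set_vid(uint16_t tci, uint16_t vid)
{
	/* Clear the 12 VID bits, then insert the new VLAN ID. */
	tci &= ~VLAN_VID_MASK;
	return tci | (vid & VLAN_VID_MASK);
}

int main(void)
{
	uint16_t tci = 0;

	tci = set_pcp(tci, 5);		/* priority 5 */
	tci = set_vid(tci, 100);	/* VLAN 100 */
	printf("tci=0x%04x\n", (unsigned int)tci);	/* 0xa064 */
	return 0;
}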
+ */ +static inline uint32_t +flow_dv_fetch_field(const uint8_t *data, uint32_t size) +{ + uint32_t ret; + + switch (size) { + case 1: + ret = *data; + break; + case 2: + ret = rte_be_to_cpu_16(*(const unaligned_uint16_t *)data); + break; + case 3: + ret = rte_be_to_cpu_16(*(const unaligned_uint16_t *)data); + ret = (ret << 8) | *(data + sizeof(uint16_t)); + break; + case 4: + ret = rte_be_to_cpu_32(*(const unaligned_uint32_t *)data); + break; + default: + MLX5_ASSERT(false); + ret = 0; + break; + } + return ret; +} + +/** + * Convert modify-header action to DV specification. + * + * Data length of each action is determined by provided field description + * and the item mask. Data bit offset and width of each action is determined + * by provided item mask. + * + * @param[in] item + * Pointer to item specification. + * @param[in] field + * Pointer to field modification information. + * For MLX5_MODIFICATION_TYPE_SET specifies destination field. + * For MLX5_MODIFICATION_TYPE_ADD specifies destination field. + * For MLX5_MODIFICATION_TYPE_COPY specifies source field. + * @param[in] dcopy + * Destination field info for MLX5_MODIFICATION_TYPE_COPY in @type. + * Negative offset value sets the same offset as source offset. + * size field is ignored, value is taken from source field. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] type + * Type of modification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_modify_action(struct rte_flow_item *item, + struct field_modify_info *field, + struct field_modify_info *dcopy, + struct mlx5_flow_dv_modify_hdr_resource *resource, + uint32_t type, struct rte_flow_error *error) +{ + uint32_t i = resource->actions_num; + struct mlx5_modification_cmd *actions = resource->actions; + + /* + * The item and mask are provided in big-endian format. + * The fields should be presented as in big-endian format either. + * Mask must be always present, it defines the actual field width. + */ + MLX5_ASSERT(item->mask); + MLX5_ASSERT(field->size); + do { + unsigned int size_b; + unsigned int off_b; + uint32_t mask; + uint32_t data; + + if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); + /* Fetch variable byte size mask from the array. */ + mask = flow_dv_fetch_field((const uint8_t *)item->mask + + field->offset, field->size); + if (!mask) { + ++field; + continue; + } + /* Deduce actual data width in bits from mask value. */ + off_b = rte_bsf32(mask); + size_b = sizeof(uint32_t) * CHAR_BIT - + off_b - __builtin_clz(mask); + MLX5_ASSERT(size_b); + size_b = size_b == sizeof(uint32_t) * CHAR_BIT ? 0 : size_b; + actions[i] = (struct mlx5_modification_cmd) { + .action_type = type, + .field = field->id, + .offset = off_b, + .length = size_b, + }; + /* Convert entire record to expected big-endian format. */ + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + if (type == MLX5_MODIFICATION_TYPE_COPY) { + MLX5_ASSERT(dcopy); + actions[i].dst_field = dcopy->id; + actions[i].dst_offset = + (int)dcopy->offset < 0 ? off_b : dcopy->offset; + /* Convert entire record to big-endian format. */ + actions[i].data1 = rte_cpu_to_be_32(actions[i].data1); + } else { + MLX5_ASSERT(item->spec); + data = flow_dv_fetch_field((const uint8_t *)item->spec + + field->offset, field->size); + /* Shift out the trailing masked bits from data. 
*/ + data = (data & mask) >> off_b; + actions[i].data1 = rte_cpu_to_be_32(data); + } + ++i; + ++field; + } while (field->size); + if (resource->actions_num == i) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "invalid modification flow item"); + resource->actions_num = i; + return 0; +} + +/** + * Convert modify-header set IPv4 address action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_ipv4 + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_ipv4 *conf = + (const struct rte_flow_action_set_ipv4 *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV4 }; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC) { + ipv4.hdr.src_addr = conf->ipv4_addr; + ipv4_mask.hdr.src_addr = rte_flow_item_ipv4_mask.hdr.src_addr; + } else { + ipv4.hdr.dst_addr = conf->ipv4_addr; + ipv4_mask.hdr.dst_addr = rte_flow_item_ipv4_mask.hdr.dst_addr; + } + item.spec = &ipv4; + item.mask = &ipv4_mask; + return flow_dv_convert_modify_action(&item, modify_ipv4, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set IPv6 address action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_ipv6 + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_ipv6 *conf = + (const struct rte_flow_action_set_ipv6 *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV6 }; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC) { + memcpy(&ipv6.hdr.src_addr, &conf->ipv6_addr, + sizeof(ipv6.hdr.src_addr)); + memcpy(&ipv6_mask.hdr.src_addr, + &rte_flow_item_ipv6_mask.hdr.src_addr, + sizeof(ipv6.hdr.src_addr)); + } else { + memcpy(&ipv6.hdr.dst_addr, &conf->ipv6_addr, + sizeof(ipv6.hdr.dst_addr)); + memcpy(&ipv6_mask.hdr.dst_addr, + &rte_flow_item_ipv6_mask.hdr.dst_addr, + sizeof(ipv6.hdr.dst_addr)); + } + item.spec = &ipv6; + item.mask = &ipv6_mask; + return flow_dv_convert_modify_action(&item, modify_ipv6, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set MAC address action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
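flow_dv_convert_modify_action() above derives each modification command's bit offset and width purely from the item mask: the offset is the position of the lowest set bit (rte_bsf32()) and the width is 32 minus the leading and trailing zero counts, with a full 32-bit width then encoded as 0. A small host-side sketch of that arithmetic, using the GCC builtins in place of the DPDK helpers (illustration only, assumes a non-zero mask):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Deduce the bit offset and width of the field selected by a non-zero mask,
 * mirroring the per-field arithmetic in flow_dv_convert_modify_action(). */
static void mask_to_off_len(uint32_t mask, unsigned int *off, unsigned int *len)
{
	*off = (unsigned int)__builtin_ctz(mask);	/* lowest set bit */
	*len = (unsigned int)(sizeof(uint32_t) * CHAR_BIT
			      - *off - (unsigned int)__builtin_clz(mask));
}

int main(void)
{
	unsigned int off, len;

	mask_to_off_len(0x00fff000u, &off, &len);
	printf("offset=%u length=%u\n", off, len);	/* offset=12 length=12 */
	return 0;
}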
+ */ +static int +flow_dv_convert_action_modify_mac + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_mac *conf = + (const struct rte_flow_action_set_mac *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_ETH }; + struct rte_flow_item_eth eth; + struct rte_flow_item_eth eth_mask; + + memset(&eth, 0, sizeof(eth)); + memset(&eth_mask, 0, sizeof(eth_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC) { + memcpy(&eth.src.addr_bytes, &conf->mac_addr, + sizeof(eth.src.addr_bytes)); + memcpy(&eth_mask.src.addr_bytes, + &rte_flow_item_eth_mask.src.addr_bytes, + sizeof(eth_mask.src.addr_bytes)); + } else { + memcpy(&eth.dst.addr_bytes, &conf->mac_addr, + sizeof(eth.dst.addr_bytes)); + memcpy(&eth_mask.dst.addr_bytes, + &rte_flow_item_eth_mask.dst.addr_bytes, + sizeof(eth_mask.dst.addr_bytes)); + } + item.spec = &eth; + item.mask = &eth_mask; + return flow_dv_convert_modify_action(&item, modify_eth, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set VLAN VID action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_vlan_vid + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_of_set_vlan_vid *conf = + (const struct rte_flow_action_of_set_vlan_vid *)(action->conf); + int i = resource->actions_num; + struct mlx5_modification_cmd *actions = resource->actions; + struct field_modify_info *field = modify_vlan_out_first_vid; + + if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); + actions[i] = (struct mlx5_modification_cmd) { + .action_type = MLX5_MODIFICATION_TYPE_SET, + .field = field->id, + .length = field->size, + .offset = field->offset, + }; + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + actions[i].data1 = conf->vlan_vid; + actions[i].data1 = actions[i].data1 << 16; + resource->actions_num = ++i; + return 0; +} + +/** + * Convert modify-header set TP action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */ +static int +flow_dv_convert_action_modify_tp + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + const struct rte_flow_item *items, + union flow_dv_attr *attr, struct mlx5_flow *dev_flow, + bool tunnel_decap, struct rte_flow_error *error) +{ + const struct rte_flow_action_set_tp *conf = + (const struct rte_flow_action_set_tp *)(action->conf); + struct rte_flow_item item; + struct rte_flow_item_udp udp; + struct rte_flow_item_udp udp_mask; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_tcp tcp_mask; + struct field_modify_info *field; + + if (!attr->valid) + flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->udp) { + memset(&udp, 0, sizeof(udp)); + memset(&udp_mask, 0, sizeof(udp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC) { + udp.hdr.src_port = conf->port; + udp_mask.hdr.src_port = + rte_flow_item_udp_mask.hdr.src_port; + } else { + udp.hdr.dst_port = conf->port; + udp_mask.hdr.dst_port = + rte_flow_item_udp_mask.hdr.dst_port; + } + item.type = RTE_FLOW_ITEM_TYPE_UDP; + item.spec = &udp; + item.mask = &udp_mask; + field = modify_udp; + } else { + MLX5_ASSERT(attr->tcp); + memset(&tcp, 0, sizeof(tcp)); + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC) { + tcp.hdr.src_port = conf->port; + tcp_mask.hdr.src_port = + rte_flow_item_tcp_mask.hdr.src_port; + } else { + tcp.hdr.dst_port = conf->port; + tcp_mask.hdr.dst_port = + rte_flow_item_tcp_mask.hdr.dst_port; + } + item.type = RTE_FLOW_ITEM_TYPE_TCP; + item.spec = &tcp; + item.mask = &tcp_mask; + field = modify_tcp; + } + return flow_dv_convert_modify_action(&item, field, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set TTL action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_convert_action_modify_ttl + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + const struct rte_flow_item *items, + union flow_dv_attr *attr, struct mlx5_flow *dev_flow, + bool tunnel_decap, struct rte_flow_error *error) +{ + const struct rte_flow_action_set_ttl *conf = + (const struct rte_flow_action_set_ttl *)(action->conf); + struct rte_flow_item item; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + struct field_modify_info *field; + + if (!attr->valid) + flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->ipv4) { + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + ipv4.hdr.time_to_live = conf->ttl_value; + ipv4_mask.hdr.time_to_live = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV4; + item.spec = &ipv4; + item.mask = &ipv4_mask; + field = modify_ipv4; + } else { + MLX5_ASSERT(attr->ipv6); + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + ipv6.hdr.hop_limits = conf->ttl_value; + ipv6_mask.hdr.hop_limits = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV6; + item.spec = &ipv6; + item.mask = &ipv6_mask; + field = modify_ipv6; + } + return flow_dv_convert_modify_action(&item, field, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header decrement TTL action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_dec_ttl + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_item *items, + union flow_dv_attr *attr, struct mlx5_flow *dev_flow, + bool tunnel_decap, struct rte_flow_error *error) +{ + struct rte_flow_item item; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + struct field_modify_info *field; + + if (!attr->valid) + flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->ipv4) { + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + ipv4.hdr.time_to_live = 0xFF; + ipv4_mask.hdr.time_to_live = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV4; + item.spec = &ipv4; + item.mask = &ipv4_mask; + field = modify_ipv4; + } else { + MLX5_ASSERT(attr->ipv6); + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + ipv6.hdr.hop_limits = 0xFF; + ipv6_mask.hdr.hop_limits = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV6; + item.spec = &ipv6; + item.mask = &ipv6_mask; + field = modify_ipv6; + } + return flow_dv_convert_modify_action(&item, field, NULL, resource, + MLX5_MODIFICATION_TYPE_ADD, error); +} + +/** + * Convert modify-header increment/decrement TCP Sequence number + * to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_tcp_seq + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const rte_be32_t *conf = (const rte_be32_t *)(action->conf); + uint64_t value = rte_be_to_cpu_32(*conf); + struct rte_flow_item item; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_tcp tcp_mask; + + memset(&tcp, 0, sizeof(tcp)); + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ) + /* + * The HW has no decrement operation, only increment operation. + * To simulate decrement X from Y using increment operation + * we need to add UINT32_MAX X times to Y. + * Each adding of UINT32_MAX decrements Y by 1. + */ + value *= UINT32_MAX; + tcp.hdr.sent_seq = rte_cpu_to_be_32((uint32_t)value); + tcp_mask.hdr.sent_seq = RTE_BE32(UINT32_MAX); + item.type = RTE_FLOW_ITEM_TYPE_TCP; + item.spec = &tcp; + item.mask = &tcp_mask; + return flow_dv_convert_modify_action(&item, modify_tcp, NULL, resource, + MLX5_MODIFICATION_TYPE_ADD, error); +} + +/** + * Convert modify-header increment/decrement TCP Acknowledgment number + * to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_tcp_ack + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const rte_be32_t *conf = (const rte_be32_t *)(action->conf); + uint64_t value = rte_be_to_cpu_32(*conf); + struct rte_flow_item item; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_tcp tcp_mask; + + memset(&tcp, 0, sizeof(tcp)); + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK) + /* + * The HW has no decrement operation, only increment operation. + * To simulate decrement X from Y using increment operation + * we need to add UINT32_MAX X times to Y. + * Each adding of UINT32_MAX decrements Y by 1. + */ + value *= UINT32_MAX; + tcp.hdr.recv_ack = rte_cpu_to_be_32((uint32_t)value); + tcp_mask.hdr.recv_ack = RTE_BE32(UINT32_MAX); + item.type = RTE_FLOW_ITEM_TYPE_TCP; + item.spec = &tcp; + item.mask = &tcp_mask; + return flow_dv_convert_modify_action(&item, modify_tcp, NULL, resource, + MLX5_MODIFICATION_TYPE_ADD, error); +} + +static enum mlx5_modification_field reg_to_field[] = { + [REG_NONE] = MLX5_MODI_OUT_NONE, + [REG_A] = MLX5_MODI_META_DATA_REG_A, + [REG_B] = MLX5_MODI_META_DATA_REG_B, + [REG_C_0] = MLX5_MODI_META_REG_C_0, + [REG_C_1] = MLX5_MODI_META_REG_C_1, + [REG_C_2] = MLX5_MODI_META_REG_C_2, + [REG_C_3] = MLX5_MODI_META_REG_C_3, + [REG_C_4] = MLX5_MODI_META_REG_C_4, + [REG_C_5] = MLX5_MODI_META_REG_C_5, + [REG_C_6] = MLX5_MODI_META_REG_C_6, + [REG_C_7] = MLX5_MODI_META_REG_C_7, +}; + +/** + * Convert register set to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
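The comments in flow_dv_convert_action_modify_tcp_seq() and ..._tcp_ack() above describe emulating subtraction with the hardware ADD operation: multiplying the decrement by UINT32_MAX yields its additive inverse modulo 2^32, so a single ADD of that value decreases the field by the requested amount. A tiny standalone demonstration of the arithmetic (not driver code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t seq = 1000;	/* current TCP sequence number */
	uint32_t dec = 3;	/* amount to subtract */
	/* (2^32 - 1) * dec is congruent to -dec modulo 2^32, so one
	 * hardware ADD of this value decrements the field by 'dec'. */
	uint32_t add = (uint32_t)((uint64_t)UINT32_MAX * dec);

	printf("%" PRIu32 "\n", seq + add);	/* prints 997 */
	return 0;
}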
+ */ +static int +flow_dv_convert_action_set_reg + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct mlx5_rte_flow_action_set_tag *conf = action->conf; + struct mlx5_modification_cmd *actions = resource->actions; + uint32_t i = resource->actions_num; + + if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); + MLX5_ASSERT(conf->id != REG_NONE); + MLX5_ASSERT(conf->id < RTE_DIM(reg_to_field)); + actions[i] = (struct mlx5_modification_cmd) { + .action_type = MLX5_MODIFICATION_TYPE_SET, + .field = reg_to_field[conf->id], + }; + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + actions[i].data1 = rte_cpu_to_be_32(conf->data); + ++i; + resource->actions_num = i; + return 0; +} + +/** + * Convert SET_TAG action to DV specification. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] conf + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_set_tag + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action_set_tag *conf, + struct rte_flow_error *error) +{ + rte_be32_t data = rte_cpu_to_be_32(conf->data); + rte_be32_t mask = rte_cpu_to_be_32(conf->mask); + struct rte_flow_item item = { + .spec = &data, + .mask = &mask, + }; + struct field_modify_info reg_c_x[] = { + [1] = {0, 0, 0}, + }; + enum mlx5_modification_field reg_type; + int ret; + + ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, conf->index, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret != REG_NONE); + MLX5_ASSERT((unsigned int)ret < RTE_DIM(reg_to_field)); + reg_type = reg_to_field[ret]; + MLX5_ASSERT(reg_type > 0); + reg_c_x[0] = (struct field_modify_info){4, 0, reg_type}; + return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert internal COPY_REG action to DV specification. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in,out] res + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_copy_mreg(struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *res, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct mlx5_flow_action_copy_mreg *conf = action->conf; + rte_be32_t mask = RTE_BE32(UINT32_MAX); + struct rte_flow_item item = { + .spec = NULL, + .mask = &mask, + }; + struct field_modify_info reg_src[] = { + {4, 0, reg_to_field[conf->src]}, + {0, 0, 0}, + }; + struct field_modify_info reg_dst = { + .offset = 0, + .id = reg_to_field[conf->dst], + }; + /* Adjust reg_c[0] usage according to reported mask. */ + if (conf->dst == REG_C_0 || conf->src == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t reg_c0 = priv->sh->dv_regc0_mask; + + MLX5_ASSERT(reg_c0); + MLX5_ASSERT(priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY); + if (conf->dst == REG_C_0) { + /* Copy to reg_c[0], within mask only. 
*/ + reg_dst.offset = rte_bsf32(reg_c0); + /* + * Mask is ignoring the endianness, because + * there is no conversion in datapath. + */ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + /* Copy from destination lower bits to reg_c[0]. */ + mask = reg_c0 >> reg_dst.offset; +#else + /* Copy from destination upper bits to reg_c[0]. */ + mask = reg_c0 << (sizeof(reg_c0) * CHAR_BIT - + rte_fls_u32(reg_c0)); +#endif + } else { + mask = rte_cpu_to_be_32(reg_c0); +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + /* Copy from reg_c[0] to destination lower bits. */ + reg_dst.offset = 0; +#else + /* Copy from reg_c[0] to destination upper bits. */ + reg_dst.offset = sizeof(reg_c0) * CHAR_BIT - + (rte_fls_u32(reg_c0) - + rte_bsf32(reg_c0)); +#endif + } + } + return flow_dv_convert_modify_action(&item, + reg_src, &reg_dst, res, + MLX5_MODIFICATION_TYPE_COPY, + error); +} + +/** + * Convert MARK action to DV specification. This routine is used + * in extensive metadata only and requires metadata register to be + * handled. In legacy mode hardware tag resource is engaged. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] conf + * Pointer to MARK action specification. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_mark(struct rte_eth_dev *dev, + const struct rte_flow_action_mark *conf, + struct mlx5_flow_dv_modify_hdr_resource *resource, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + rte_be32_t mask = rte_cpu_to_be_32(MLX5_FLOW_MARK_MASK & + priv->sh->dv_mark_mask); + rte_be32_t data = rte_cpu_to_be_32(conf->id) & mask; + struct rte_flow_item item = { + .spec = &data, + .mask = &mask, + }; + struct field_modify_info reg_c_x[] = { + {4, 0, 0}, /* dynamic instead of MLX5_MODI_META_REG_C_1. */ + {0, 0, 0}, + }; + int reg; + + if (!mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "zero mark action mask"); + reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (reg < 0) + return reg; + MLX5_ASSERT(reg > 0); + if (reg == REG_C_0) { + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); + + data = rte_cpu_to_be_32(rte_cpu_to_be_32(data) << shl_c0); + mask = rte_cpu_to_be_32(mask) & msk_c0; + mask = rte_cpu_to_be_32(mask << shl_c0); + } + reg_c_x[0].id = reg_to_field[reg]; + return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Get metadata register index for specified steering domain. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] attr + * Attributes of flow to determine steering domain. + * @param[out] error + * Pointer to the error structure. + * + * @return + * positive index on success, a negative errno value otherwise + * and rte_errno is set. + */ +static enum modify_reg +flow_dv_get_metadata_reg(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + int reg = + mlx5_flow_get_reg_id(dev, attr->transfer ? + MLX5_METADATA_FDB : + attr->egress ? + MLX5_METADATA_TX : + MLX5_METADATA_RX, 0, error); + if (reg < 0) + return rte_flow_error_set(error, + ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "unavailable " + "metadata register"); + return reg; +} + +/** + * Convert SET_META action to DV specification.
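Both flow_dv_convert_action_copy_mreg() and flow_dv_convert_action_mark() above have to confine their writes to the software-owned bits of reg_c[0] reported in dv_regc0_mask, which means shifting the value up to the mask's lowest set bit before applying it. A simplified host-order sketch of that packing step follows; the driver additionally performs big-endian conversion, which is omitted here, and the example mask is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Place 'val' into the sub-field of a 32-bit register described by a
 * non-zero 'mask' (rte_bsf32() replaced by the ctz builtin). */
static uint32_t pack_into_mask(uint32_t val, uint32_t mask)
{
	unsigned int shl = (unsigned int)__builtin_ctz(mask);

	return (val << shl) & mask;
}

int main(void)
{
	uint32_t regc0_mask = 0x00ffff00u;	/* example: bits 8..23 usable */

	printf("0x%08x\n", pack_into_mask(0xabcd, regc0_mask)); /* 0x00abcd00 */
	return 0;
}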
+ * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[in] conf + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_set_meta + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_attr *attr, + const struct rte_flow_action_set_meta *conf, + struct rte_flow_error *error) +{ + uint32_t data = conf->data; + uint32_t mask = conf->mask; + struct rte_flow_item item = { + .spec = &data, + .mask = &mask, + }; + struct field_modify_info reg_c_x[] = { + [1] = {0, 0, 0}, + }; + int reg = flow_dv_get_metadata_reg(dev, attr, error); + + if (reg < 0) + return reg; + /* + * In datapath code there is no endianness + * coversions for perfromance reasons, all + * pattern conversions are done in rte_flow. + */ + if (reg == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0; + + MLX5_ASSERT(msk_c0); +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + shl_c0 = rte_bsf32(msk_c0); +#else + shl_c0 = sizeof(msk_c0) * CHAR_BIT - rte_fls_u32(msk_c0); +#endif + mask <<= shl_c0; + data <<= shl_c0; + MLX5_ASSERT(!(~msk_c0 & rte_cpu_to_be_32(mask))); + } + reg_c_x[0] = (struct field_modify_info){4, 0, reg_to_field[reg]}; + /* The routine expects parameters in memory as big-endian ones. */ + return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set IPv4 DSCP action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_ipv4_dscp + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_dscp *conf = + (const struct rte_flow_action_set_dscp *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV4 }; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + ipv4.hdr.type_of_service = conf->dscp; + ipv4_mask.hdr.type_of_service = RTE_IPV4_HDR_DSCP_MASK >> 2; + item.spec = &ipv4; + item.mask = &ipv4_mask; + return flow_dv_convert_modify_action(&item, modify_ipv4, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set IPv6 DSCP action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_convert_action_modify_ipv6_dscp + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_dscp *conf = + (const struct rte_flow_action_set_dscp *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV6 }; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + /* + * Even though the DSCP bits offset of IPv6 is not byte aligned, + * rdma-core only accept the DSCP bits byte aligned start from + * bit 0 to 5 as to be compatible with IPv4. No need to shift the + * bits in IPv6 case as rdma-core requires byte aligned value. + */ + ipv6.hdr.vtc_flow = conf->dscp; + ipv6_mask.hdr.vtc_flow = RTE_IPV6_HDR_DSCP_MASK >> 22; + item.spec = &ipv6; + item.mask = &ipv6_mask; + return flow_dv_convert_modify_action(&item, modify_ipv6, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Validate MARK item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_mark(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr __rte_unused, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_item_mark *spec = item->spec; + const struct rte_flow_item_mark *mask = item->mask; + const struct rte_flow_item_mark nic_mask = { + .id = priv->sh->dv_mark_mask, + }; + int ret; + + if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata feature" + " isn't enabled"); + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata register" + " isn't supported"); + if (!nic_mask.id) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata register" + " isn't available"); + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return ret; + if (!spec) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); + if (spec->id >= (MLX5_FLOW_MARK_MAX & nic_mask.id)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &spec->id, + "mark id exceeds the limit"); + if (!mask) + mask = &nic_mask; + if (!mask->id) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "mask cannot be zero"); + + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_mark), + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate META item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
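The DSCP converters above rely on the fact that shifting the DPDK header masks right by 2 (IPv4 ToS) and by 22 (IPv6 traffic class) both leave the six DSCP bits aligned at bit 0, the layout rdma-core expects for the modify-header action. A quick standalone check of those shifts; the literal mask values are copied from rte_ip.h as I read them, so treat them as an assumption rather than normative.

#include <stdio.h>

/* Local copies of the rte_ip.h masks (assumed values). */
#define IPV4_HDR_DSCP_MASK 0xfcu		/* DSCP in the top 6 bits of ToS */
#define IPV6_HDR_DSCP_MASK 0x0fc00000u	/* DSCP in the top 6 bits of TC  */

int main(void)
{
	/* Both shifts realign the 6 DSCP bits to bit 0. */
	printf("ipv4: 0x%02x\n", IPV4_HDR_DSCP_MASK >> 2);	/* 0x3f */
	printf("ipv6: 0x%02x\n", IPV6_HDR_DSCP_MASK >> 22);	/* 0x3f */
	return 0;
}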
+ */ +static int +flow_dv_validate_item_meta(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_item_meta *spec = item->spec; + const struct rte_flow_item_meta *mask = item->mask; + struct rte_flow_item_meta nic_mask = { + .data = UINT32_MAX + }; + int reg; + int ret; + + if (!spec) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); + if (config->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata register" + " isn't supported"); + reg = flow_dv_get_metadata_reg(dev, attr, error); + if (reg < 0) + return reg; + if (reg == REG_B) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "match on reg_b " + "isn't supported"); + if (reg != REG_A) + nic_mask.data = priv->sh->dv_meta_mask; + } + if (!mask) + mask = &rte_flow_item_meta_mask; + if (!mask->data) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "mask cannot be zero"); + + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_meta), + error); + return ret; +} + +/** + * Validate TAG item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_tag(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr __rte_unused, + struct rte_flow_error *error) +{ + const struct rte_flow_item_tag *spec = item->spec; + const struct rte_flow_item_tag *mask = item->mask; + const struct rte_flow_item_tag nic_mask = { + .data = RTE_BE32(UINT32_MAX), + .index = 0xff, + }; + int ret; + + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extensive metadata register" + " isn't supported"); + if (!spec) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); + if (!mask) + mask = &rte_flow_item_tag_mask; + if (!mask->data) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "mask cannot be zero"); + + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_tag), + error); + if (ret < 0) + return ret; + if (mask->index != 0xff) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "partial mask for tag index" + " is not supported"); + ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, spec->index, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret != REG_NONE); + return 0; +} + +/** + * Validate vport item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_port_id(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr, + uint64_t item_flags, + struct rte_flow_error *error) +{ + const struct rte_flow_item_port_id *spec = item->spec; + const struct rte_flow_item_port_id *mask = item->mask; + const struct rte_flow_item_port_id switch_mask = { + .id = 0xffffffff, + }; + struct mlx5_priv *esw_priv; + struct mlx5_priv *dev_priv; + int ret; + + if (!attr->transfer) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, + "match on port id is valid only" + " when transfer flag is enabled"); + if (item_flags & MLX5_FLOW_ITEM_PORT_ID) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple source ports are not" + " supported"); + if (!mask) + mask = &switch_mask; + if (mask->id != 0xffffffff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask, + "no support for partial mask on" + " \"id\" field"); + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_port_id_mask, + sizeof(struct rte_flow_item_port_id), + error); + if (ret) + return ret; + if (!spec) + return 0; + esw_priv = mlx5_port_to_eswitch_info(spec->id, false); + if (!esw_priv) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, spec, + "failed to obtain E-Switch info for" + " port"); + dev_priv = mlx5_dev_to_eswitch_info(dev); + if (!dev_priv) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "failed to obtain E-Switch info"); + if (esw_priv->domain_id != dev_priv->domain_id) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, spec, + "cannot match on a port from a" + " different E-Switch"); + return 0; +} + +/* + * GTP flags are contained in 1 byte of the format: + * ------------------------------------------- + * | bit | 0 - 2 | 3 | 4 | 5 | 6 | 7 | + * |-----------------------------------------| + * | value | Version | PT | Res | E | S | PN | + * ------------------------------------------- + * + * Matching is supported only for GTP flags E, S, PN. + */ +#define MLX5_GTP_FLAGS_MASK 0x07 + +/** + * Validate VLAN item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] dev + * Ethernet device flow is being created on. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_vlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vlan *mask = item->mask; + const struct rte_flow_item_vlan nic_mask = { + .tci = RTE_BE16(UINT16_MAX), + .inner_type = RTE_BE16(UINT16_MAX), + }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int ret; + const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | + MLX5_FLOW_LAYER_INNER_L4) : + (MLX5_FLOW_LAYER_OUTER_L3 | + MLX5_FLOW_LAYER_OUTER_L4); + const uint64_t vlanm = tunnel ? 
MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + + if (item_flags & vlanm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple VLAN layers not supported"); + else if ((item_flags & l34m) != 0) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VLAN cannot follow L3/L4 layer"); + if (!mask) + mask = &rte_flow_item_vlan_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_vlan), + error); + if (ret) + return ret; + if (!tunnel && mask->tci != RTE_BE16(0x0fff)) { + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->vmwa_context) { + /* + * Non-NULL context means we have a virtual machine + * and SR-IOV enabled, we have to create VLAN interface + * to make hypervisor to setup E-Switch vport + * context correctly. We avoid creating the multiple + * VLAN interfaces, so we cannot support VLAN tag mask. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VLAN tag mask is not" + " supported in virtual" + " environment"); + } + } + return 0; +} + +/** + * Validate GTP item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_gtp(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_gtp *spec = item->spec; + const struct rte_flow_item_gtp *mask = item->mask; + const struct rte_flow_item_gtp nic_mask = { + .v_pt_rsv_flags = MLX5_GTP_FLAGS_MASK, + .msg_type = 0xff, + .teid = RTE_BE32(0xffffffff), + }; + + if (!priv->config.hca_attr.tunnel_stateless_gtp) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "GTP support is not enabled"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_gtp_mask; + if (spec && spec->v_pt_rsv_flags & ~MLX5_GTP_FLAGS_MASK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Match is supported for GTP" + " flags only"); + return mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_gtp), + error); +} + +/** + * Validate the pop VLAN action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the pop vlan action. + * @param[in] item_flags + * The items found in this flow rule. + * @param[in] attr + * Pointer to flow attributes. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
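+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary) of an action list this check accepts on a transfer rule
+ * whose pattern matches an outer VLAN; the port_id fate action has to
+ * come after the pop:
+ *
+ * @code
+ * struct rte_flow_action_port_id pid = { .id = 1 };
+ * struct rte_flow_action actions[] = {
+ *     { .type = RTE_FLOW_ACTION_TYPE_OF_POP_VLAN },
+ *     { .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &pid },
+ *     { .type = RTE_FLOW_ACTION_TYPE_END },
+ * };
+ * @endcode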
+ */
+static int
+flow_dv_validate_action_pop_vlan(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_action *action,
+ uint64_t item_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_priv *priv = dev->data->dev_private;
+
+ (void)action;
+ (void)attr;
+ if (!priv->sh->pop_vlan_action)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "pop vlan action is not supported");
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+ NULL,
+ "pop vlan action not supported for "
+ "egress");
+ if (action_flags & MLX5_FLOW_VLAN_ACTIONS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "no support for multiple VLAN "
+ "actions");
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "cannot pop vlan without a "
+ "match on (outer) vlan in the flow");
+ if (action_flags & MLX5_FLOW_ACTION_PORT_ID)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "wrong action order, port_id should "
+ "be after pop VLAN action");
+ if (!attr->transfer && priv->representor)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "pop vlan action for VF representor "
+ "not supported on NIC table");
+ return 0;
+}
+
+/**
+ * Get the default VLAN info from the VLAN match info.
+ *
+ * @param[in] items
+ * The list of item specifications.
+ * @param[out] vlan
+ * Pointer to the VLAN info to fill.
+ */
+static void
+flow_dev_get_vlan_info_from_items(const struct rte_flow_item *items,
+ struct rte_vlan_hdr *vlan)
+{
+ const struct rte_flow_item_vlan nic_mask = {
+ .tci = RTE_BE16(MLX5DV_FLOW_VLAN_PCP_MASK |
+ MLX5DV_FLOW_VLAN_VID_MASK),
+ .inner_type = RTE_BE16(0xffff),
+ };
+
+ if (items == NULL)
+ return;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int type = items->type;
+
+ if (type == RTE_FLOW_ITEM_TYPE_VLAN ||
+ type == MLX5_RTE_FLOW_ITEM_TYPE_VLAN)
+ break;
+ }
+ if (items->type != RTE_FLOW_ITEM_TYPE_END) {
+ const struct rte_flow_item_vlan *vlan_m = items->mask;
+ const struct rte_flow_item_vlan *vlan_v = items->spec;
+
+ /* If VLAN item in pattern doesn't contain data, return here. */
+ if (!vlan_v)
+ return;
+ if (!vlan_m)
+ vlan_m = &nic_mask;
+ /* Only full match values are accepted. */
+ if ((vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) ==
+ MLX5DV_FLOW_VLAN_PCP_MASK_BE) {
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK;
+ vlan->vlan_tci |=
+ rte_be_to_cpu_16(vlan_v->tci &
+ MLX5DV_FLOW_VLAN_PCP_MASK_BE);
+ }
+ if ((vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) ==
+ MLX5DV_FLOW_VLAN_VID_MASK_BE) {
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_VID_MASK;
+ vlan->vlan_tci |=
+ rte_be_to_cpu_16(vlan_v->tci &
+ MLX5DV_FLOW_VLAN_VID_MASK_BE);
+ }
+ if (vlan_m->inner_type == nic_mask.inner_type)
+ vlan->eth_proto = rte_be_to_cpu_16(vlan_v->inner_type &
+ vlan_m->inner_type);
+ }
+}
+
+/**
+ * Validate the push VLAN action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] vlan_m
+ * Pointer to the VLAN mask from the VLAN item of the flow pattern,
+ * can be NULL.
+ * @param[in] action
+ * Pointer to the action structure.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_push_vlan(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_item_vlan *vlan_m, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_of_push_vlan *push_vlan = action->conf; + const struct mlx5_priv *priv = dev->data->dev_private; + + if (!attr->transfer && attr->ingress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "push VLAN action not supported for " + "ingress"); + if (push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_VLAN) && + push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_QINQ)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "invalid vlan ethertype"); + if (action_flags & MLX5_FLOW_VLAN_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "no support for multiple VLAN " + "actions"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after push VLAN"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "push vlan action for VF representor " + "not supported on NIC table"); + if (vlan_m && + (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) && + (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) != + MLX5DV_FLOW_VLAN_PCP_MASK_BE && + !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) && + !(mlx5_flow_find_action + (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "not full match mask on VLAN PCP and " + "there is no of_set_vlan_pcp action, " + "push VLAN action cannot figure out " + "PCP value"); + if (vlan_m && + (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) && + (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) != + MLX5DV_FLOW_VLAN_VID_MASK_BE && + !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID) && + !(mlx5_flow_find_action + (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "not full match mask on VLAN VID and " + "there is no of_set_vlan_vid action, " + "push VLAN action cannot figure out " + "VID value"); + (void)attr; + return 0; +} + +/** + * Validate the set VLAN PCP. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] actions + * Pointer to the list of actions remaining in the flow rule. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
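+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary): set VLAN PCP is only accepted when it follows a push VLAN
+ * action in the same action list and appears before any port_id action:
+ *
+ * @code
+ * struct rte_flow_action_of_push_vlan push = {
+ *     .ethertype = RTE_BE16(RTE_ETHER_TYPE_VLAN),
+ * };
+ * struct rte_flow_action_of_set_vlan_pcp pcp = { .vlan_pcp = 3 };
+ * struct rte_flow_action actions[] = {
+ *     { .type = RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN, .conf = &push },
+ *     { .type = RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP, .conf = &pcp },
+ *     { .type = RTE_FLOW_ACTION_TYPE_END },
+ * };
+ * @endcode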
+ */ +static int +flow_dv_validate_action_set_vlan_pcp(uint64_t action_flags, + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + const struct rte_flow_action *action = actions; + const struct rte_flow_action_of_set_vlan_pcp *conf = action->conf; + + if (conf->vlan_pcp > 7) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "VLAN PCP value is too big"); + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "set VLAN PCP action must follow " + "the push VLAN action"); + if (action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Multiple VLAN PCP modification are " + "not supported"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after set VLAN PCP"); + return 0; +} + +/** + * Validate the set VLAN VID. + * + * @param[in] item_flags + * Holds the items detected in this rule. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] actions + * Pointer to the list of actions remaining in the flow rule. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_set_vlan_vid(uint64_t item_flags, + uint64_t action_flags, + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + const struct rte_flow_action *action = actions; + const struct rte_flow_action_of_set_vlan_vid *conf = action->conf; + + if (rte_be_to_cpu_16(conf->vlan_vid) > 0xFFE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "VLAN VID value is too big"); + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) && + !(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "set VLAN VID action must follow push" + " VLAN action or match on VLAN item"); + if (action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Multiple VLAN VID modifications are " + "not supported"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after set VLAN VID"); + return 0; +} + +/* + * Validate the FLAG action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_flag(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + int ret; + + /* Fall back if no extended metadata register support. */ + if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY) + return mlx5_flow_validate_action_flag(action_flags, attr, + error); + /* Extensive metadata mode requires registers. 
*/ + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "no metadata registers " + "to support flag action"); + if (!(priv->sh->dv_mark_mask & MLX5_FLOW_MARK_DEFAULT)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "extended metadata register" + " isn't available"); + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret > 0); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't mark and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 flag" + " actions in same flow"); + return 0; +} + +/** + * Validate MARK action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action + * Pointer to action. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_mark(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_action_mark *mark = action->conf; + int ret; + + /* Fall back if no extended metadata register support. */ + if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY) + return mlx5_flow_validate_action_mark(action, action_flags, + attr, error); + /* Extensive metadata mode requires registers. */ + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "no metadata registers " + "to support mark action"); + if (!priv->sh->dv_mark_mask) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "extended metadata register" + " isn't available"); + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret > 0); + if (!mark) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (mark->id >= (MLX5_FLOW_MARK_MAX & priv->sh->dv_mark_mask)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &mark->id, + "mark id exceeds the limit"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't flag and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 mark actions in same" + " flow"); + return 0; +} + +/** + * Validate SET_META action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action + * Pointer to the action structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
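+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary): the configuration must carry a non-zero mask that fits the
+ * metadata register selected for the flow:
+ *
+ * @code
+ * struct rte_flow_action_set_meta meta = {
+ *     .data = 0xcafe,
+ *     .mask = 0xffff,
+ * };
+ * struct rte_flow_action action = {
+ *     .type = RTE_FLOW_ACTION_TYPE_SET_META,
+ *     .conf = &meta,
+ * };
+ * @endcode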
+ */ +static int +flow_dv_validate_action_set_meta(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + uint64_t action_flags __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_meta *conf; + uint32_t nic_mask = UINT32_MAX; + int reg; + + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "extended metadata register" + " isn't supported"); + reg = flow_dv_get_metadata_reg(dev, attr, error); + if (reg < 0) + return reg; + if (reg != REG_A && reg != REG_B) { + struct mlx5_priv *priv = dev->data->dev_private; + + nic_mask = priv->sh->dv_meta_mask; + } + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + conf = (const struct rte_flow_action_set_meta *)action->conf; + if (!conf->mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "zero mask doesn't have any effect"); + if (conf->mask & ~nic_mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "meta data must be within reg C0"); + return 0; +} + +/** + * Validate SET_TAG action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action + * Pointer to the action structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_set_tag(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_tag *conf; + const uint64_t terminal_action_flags = + MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | + MLX5_FLOW_ACTION_RSS; + int ret; + + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "extensive metadata register" + " isn't supported"); + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + conf = (const struct rte_flow_action_set_tag *)action->conf; + if (!conf->mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "zero mask doesn't have any effect"); + ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, conf->index, error); + if (ret < 0) + return ret; + if (!attr->transfer && attr->ingress && + (action_flags & terminal_action_flags)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "set_tag has no effect" + " with terminal actions"); + return 0; +} + +/** + * Validate count action. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
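+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary) of the action this check guards; it is only accepted when
+ * DevX flow counters are available:
+ *
+ * @code
+ * struct rte_flow_action_count cnt = { .shared = 0, .id = 0 };
+ * struct rte_flow_action action = {
+ *     .type = RTE_FLOW_ACTION_TYPE_COUNT,
+ *     .conf = &cnt,
+ * };
+ * @endcode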
+ */ +static int +flow_dv_validate_action_count(struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!priv->config.devx) + goto notsup_err; +#ifdef HAVE_IBV_FLOW_DEVX_COUNTERS + return 0; +#endif +notsup_err: + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "count action not supported"); +} + +/** + * Validate the L2 encap action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the action structure. + * @param[in] attr + * Pointer to flow attributes. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_l2_encap(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct mlx5_priv *priv = dev->data->dev_private; + + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can only have a single encap action " + "in a flow"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "encap action for VF representor " + "not supported on NIC table"); + return 0; +} + +/** + * Validate a decap action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_decap(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct mlx5_priv *priv = dev->data->dev_private; + + if (action_flags & MLX5_FLOW_XCAP_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + action_flags & + MLX5_FLOW_ACTION_DECAP ? "can only " + "have a single decap action" : "decap " + "after encap is not supported"); + if (action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have decap action after" + " modify action"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, + "decap action not supported for " + "egress"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "decap action for VF representor " + "not supported on NIC table"); + return 0; +} + +const struct rte_flow_action_raw_decap empty_decap = {.data = NULL, .size = 0,}; + +/** + * Validate the raw encap and decap actions. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] decap + * Pointer to the decap action. + * @param[in] encap + * Pointer to the encap action. + * @param[in] attr + * Pointer to flow attributes + * @param[in/out] action_flags + * Holds the actions detected until now. + * @param[out] actions_n + * pointer to the number of actions counter. 
+ * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_raw_encap_decap + (struct rte_eth_dev *dev, + const struct rte_flow_action_raw_decap *decap, + const struct rte_flow_action_raw_encap *encap, + const struct rte_flow_attr *attr, uint64_t *action_flags, + int *actions_n, struct rte_flow_error *error) +{ + const struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + if (encap && (!encap->size || !encap->data)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "raw encap data cannot be empty"); + if (decap && encap) { + if (decap->size <= MLX5_ENCAPSULATION_DECISION_SIZE && + encap->size > MLX5_ENCAPSULATION_DECISION_SIZE) + /* L3 encap. */ + decap = NULL; + else if (encap->size <= + MLX5_ENCAPSULATION_DECISION_SIZE && + decap->size > + MLX5_ENCAPSULATION_DECISION_SIZE) + /* L3 decap. */ + encap = NULL; + else if (encap->size > + MLX5_ENCAPSULATION_DECISION_SIZE && + decap->size > + MLX5_ENCAPSULATION_DECISION_SIZE) + /* 2 L2 actions: encap and decap. */ + ; + else + return rte_flow_error_set(error, + ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "unsupported too small " + "raw decap and too small raw " + "encap combination"); + } + if (decap) { + ret = flow_dv_validate_action_decap(dev, *action_flags, attr, + error); + if (ret < 0) + return ret; + *action_flags |= MLX5_FLOW_ACTION_DECAP; + ++(*actions_n); + } + if (encap) { + if (encap->size <= MLX5_ENCAPSULATION_DECISION_SIZE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "small raw encap size"); + if (*action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "more than one encap action"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "encap action for VF representor " + "not supported on NIC table"); + *action_flags |= MLX5_FLOW_ACTION_ENCAP; + ++(*actions_n); + } + return 0; +} + +/** + * Find existing encap/decap resource or create and register a new one. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] resource + * Pointer to encap/decap resource. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_encap_decap_resource_register + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_encap_decap_resource *resource, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_encap_decap_resource *cache_resource; + struct mlx5dv_dr_domain *domain; + uint32_t idx = 0; + + resource->flags = dev_flow->dv.group ? 0 : 1; + if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB) + domain = sh->fdb_domain; + else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_RX) + domain = sh->rx_domain; + else + domain = sh->tx_domain; + /* Lookup a matching resource from cache. 
*/ + ILIST_FOREACH(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], sh->encaps_decaps, idx, + cache_resource, next) { + if (resource->reformat_type == cache_resource->reformat_type && + resource->ft_type == cache_resource->ft_type && + resource->flags == cache_resource->flags && + resource->size == cache_resource->size && + !memcmp((const void *)resource->buf, + (const void *)cache_resource->buf, + resource->size)) { + DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.rix_encap_decap = idx; + dev_flow->dv.encap_decap = cache_resource; + return 0; + } + } + /* Register new encap/decap resource. */ + cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], + &dev_flow->handle->dvh.rix_encap_decap); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; + cache_resource->verbs_action = + mlx5_glue->dv_create_flow_action_packet_reformat + (sh->ctx, cache_resource->reformat_type, + cache_resource->ft_type, domain, cache_resource->flags, + cache_resource->size, + (cache_resource->size ? cache_resource->buf : NULL)); + if (!cache_resource->verbs_action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + ILIST_INSERT(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], &sh->encaps_decaps, + dev_flow->handle->dvh.rix_encap_decap, cache_resource, + next); + dev_flow->dv.encap_decap = cache_resource; + DRV_LOG(DEBUG, "new encap/decap resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} + +/** + * Find existing table jump resource or create and register a new one. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] tbl + * Pointer to flow table resource. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_jump_tbl_resource_register + (struct rte_eth_dev *dev __rte_unused, + struct mlx5_flow_tbl_resource *tbl, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_tbl_data_entry *tbl_data = + container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl); + int cnt; + + MLX5_ASSERT(tbl); + cnt = rte_atomic32_read(&tbl_data->jump.refcnt); + if (!cnt) { + tbl_data->jump.action = + mlx5_glue->dr_create_flow_action_dest_flow_tbl + (tbl->obj); + if (!tbl_data->jump.action) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create jump action"); + DRV_LOG(DEBUG, "new jump table resource %p: refcnt %d++", + (void *)&tbl_data->jump, cnt); + } else { + /* old jump should not make the table ref++. */ + flow_dv_tbl_resource_release(dev, &tbl_data->tbl); + MLX5_ASSERT(tbl_data->jump.action); + DRV_LOG(DEBUG, "existed jump table resource %p: refcnt %d++", + (void *)&tbl_data->jump, cnt); + } + rte_atomic32_inc(&tbl_data->jump.refcnt); + dev_flow->handle->rix_jump = tbl_data->idx; + dev_flow->dv.jump = &tbl_data->jump; + return 0; +} + +/** + * Find existing table port ID resource or create and register a new one. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. 
+ * @param[in, out] resource
+ * Pointer to port ID action resource.
+ * @param[in, out] dev_flow
+ * Pointer to the dev_flow.
+ * @param[out] error
+ * pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_port_id_action_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_port_id_action_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_port_id_action_resource *cache_resource;
+ uint32_t idx = 0;
+
+ /* Lookup a matching resource from cache. */
+ ILIST_FOREACH(sh->ipool[MLX5_IPOOL_PORT_ID], sh->port_id_action_list,
+ idx, cache_resource, next) {
+ if (resource->port_id == cache_resource->port_id) {
+ DRV_LOG(DEBUG, "port id action resource %p: "
+ "refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ rte_atomic32_inc(&cache_resource->refcnt);
+ dev_flow->handle->rix_port_id_action = idx;
+ dev_flow->dv.port_id_action = cache_resource;
+ return 0;
+ }
+ }
+ /* Register new port id action resource. */
+ cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_PORT_ID],
+ &dev_flow->handle->rix_port_id_action);
+ if (!cache_resource)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate resource memory");
+ *cache_resource = *resource;
+ /*
+ * Depending on rdma_core version the glue routine calls
+ * either mlx5dv_dr_action_create_dest_ib_port(domain, ibv_port)
+ * or mlx5dv_dr_action_create_dest_vport(domain, vport_id).
+ */
+ cache_resource->action =
+ mlx5_glue->dr_create_flow_action_dest_port
+ (priv->sh->fdb_domain, resource->port_id);
+ if (!cache_resource->action) {
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create action");
+ }
+ rte_atomic32_init(&cache_resource->refcnt);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ ILIST_INSERT(sh->ipool[MLX5_IPOOL_PORT_ID], &sh->port_id_action_list,
+ dev_flow->handle->rix_port_id_action, cache_resource,
+ next);
+ dev_flow->dv.port_id_action = cache_resource;
+ DRV_LOG(DEBUG, "new port id action resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+}
+
+/**
+ * Find existing push VLAN resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] resource
+ * Pointer to push VLAN action resource.
+ * @param[in, out] dev_flow
+ * Pointer to the dev_flow.
+ * @param[out] error
+ * pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_push_vlan_action_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_push_vlan_action_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_push_vlan_action_resource *cache_resource;
+ struct mlx5dv_dr_domain *domain;
+ uint32_t idx = 0;
+
+ /* Lookup a matching resource from cache.
*/ + ILIST_FOREACH(sh->ipool[MLX5_IPOOL_PUSH_VLAN], + sh->push_vlan_action_list, idx, cache_resource, next) { + if (resource->vlan_tag == cache_resource->vlan_tag && + resource->ft_type == cache_resource->ft_type) { + DRV_LOG(DEBUG, "push-VLAN action resource resource %p: " + "refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.rix_push_vlan = idx; + dev_flow->dv.push_vlan_res = cache_resource; + return 0; + } + } + /* Register new push_vlan action resource. */ + cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_PUSH_VLAN], + &dev_flow->handle->dvh.rix_push_vlan); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; + if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB) + domain = sh->fdb_domain; + else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_RX) + domain = sh->rx_domain; + else + domain = sh->tx_domain; + cache_resource->action = + mlx5_glue->dr_create_flow_action_push_vlan(domain, + resource->vlan_tag); + if (!cache_resource->action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + ILIST_INSERT(sh->ipool[MLX5_IPOOL_PUSH_VLAN], + &sh->push_vlan_action_list, + dev_flow->handle->dvh.rix_push_vlan, + cache_resource, next); + dev_flow->dv.push_vlan_res = cache_resource; + DRV_LOG(DEBUG, "new push vlan action resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} +/** + * Get the size of specific rte_flow_item_type + * + * @param[in] item_type + * Tested rte_flow_item_type. + * + * @return + * sizeof struct item_type, 0 if void or irrelevant. + */ +static size_t +flow_dv_get_item_len(const enum rte_flow_item_type item_type) +{ + size_t retval; + + switch (item_type) { + case RTE_FLOW_ITEM_TYPE_ETH: + retval = sizeof(struct rte_flow_item_eth); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + retval = sizeof(struct rte_flow_item_vlan); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + retval = sizeof(struct rte_flow_item_ipv4); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + retval = sizeof(struct rte_flow_item_ipv6); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + retval = sizeof(struct rte_flow_item_udp); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + retval = sizeof(struct rte_flow_item_tcp); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + retval = sizeof(struct rte_flow_item_vxlan); + break; + case RTE_FLOW_ITEM_TYPE_GRE: + retval = sizeof(struct rte_flow_item_gre); + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + retval = sizeof(struct rte_flow_item_nvgre); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + retval = sizeof(struct rte_flow_item_vxlan_gpe); + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + retval = sizeof(struct rte_flow_item_mpls); + break; + case RTE_FLOW_ITEM_TYPE_VOID: /* Fall through. 
*/ + default: + retval = 0; + break; + } + return retval; +} + +#define MLX5_ENCAP_IPV4_VERSION 0x40 +#define MLX5_ENCAP_IPV4_IHL_MIN 0x05 +#define MLX5_ENCAP_IPV4_TTL_DEF 0x40 +#define MLX5_ENCAP_IPV6_VTC_FLOW 0x60000000 +#define MLX5_ENCAP_IPV6_HOP_LIMIT 0xff +#define MLX5_ENCAP_VXLAN_FLAGS 0x08000000 +#define MLX5_ENCAP_VXLAN_GPE_FLAGS 0x04 + +/** + * Convert the encap action data from list of rte_flow_item to raw buffer + * + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[out] buf + * Pointer to the output buffer. + * @param[out] size + * Pointer to the output buffer size. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_encap_data(const struct rte_flow_item *items, uint8_t *buf, + size_t *size, struct rte_flow_error *error) +{ + struct rte_ether_hdr *eth = NULL; + struct rte_vlan_hdr *vlan = NULL; + struct rte_ipv4_hdr *ipv4 = NULL; + struct rte_ipv6_hdr *ipv6 = NULL; + struct rte_udp_hdr *udp = NULL; + struct rte_vxlan_hdr *vxlan = NULL; + struct rte_vxlan_gpe_hdr *vxlan_gpe = NULL; + struct rte_gre_hdr *gre = NULL; + size_t len; + size_t temp_size = 0; + + if (!items) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "invalid empty data"); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + len = flow_dv_get_item_len(items->type); + if (len + temp_size > MLX5_ENCAP_MAX_LEN) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "items total size is too big" + " for encap action"); + rte_memcpy((void *)&buf[temp_size], items->spec, len); + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_ETH: + eth = (struct rte_ether_hdr *)&buf[temp_size]; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + vlan = (struct rte_vlan_hdr *)&buf[temp_size]; + if (!eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "eth header not found"); + if (!eth->ether_type) + eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ipv4 = (struct rte_ipv4_hdr *)&buf[temp_size]; + if (!vlan && !eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "neither eth nor vlan" + " header found"); + if (vlan && !vlan->eth_proto) + vlan->eth_proto = RTE_BE16(RTE_ETHER_TYPE_IPV4); + else if (eth && !eth->ether_type) + eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4); + if (!ipv4->version_ihl) + ipv4->version_ihl = MLX5_ENCAP_IPV4_VERSION | + MLX5_ENCAP_IPV4_IHL_MIN; + if (!ipv4->time_to_live) + ipv4->time_to_live = MLX5_ENCAP_IPV4_TTL_DEF; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ipv6 = (struct rte_ipv6_hdr *)&buf[temp_size]; + if (!vlan && !eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "neither eth nor vlan" + " header found"); + if (vlan && !vlan->eth_proto) + vlan->eth_proto = RTE_BE16(RTE_ETHER_TYPE_IPV6); + else if (eth && !eth->ether_type) + eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6); + if (!ipv6->vtc_flow) + ipv6->vtc_flow = + RTE_BE32(MLX5_ENCAP_IPV6_VTC_FLOW); + if (!ipv6->hop_limits) + ipv6->hop_limits = MLX5_ENCAP_IPV6_HOP_LIMIT; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + udp = (struct rte_udp_hdr *)&buf[temp_size]; + if (!ipv4 && !ipv6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "ip header not found"); + if (ipv4 && !ipv4->next_proto_id) 
+ ipv4->next_proto_id = IPPROTO_UDP; + else if (ipv6 && !ipv6->proto) + ipv6->proto = IPPROTO_UDP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + vxlan = (struct rte_vxlan_hdr *)&buf[temp_size]; + if (!udp) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "udp header not found"); + if (!udp->dst_port) + udp->dst_port = RTE_BE16(MLX5_UDP_PORT_VXLAN); + if (!vxlan->vx_flags) + vxlan->vx_flags = + RTE_BE32(MLX5_ENCAP_VXLAN_FLAGS); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + vxlan_gpe = (struct rte_vxlan_gpe_hdr *)&buf[temp_size]; + if (!udp) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "udp header not found"); + if (!vxlan_gpe->proto) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "next protocol not found"); + if (!udp->dst_port) + udp->dst_port = + RTE_BE16(MLX5_UDP_PORT_VXLAN_GPE); + if (!vxlan_gpe->vx_flags) + vxlan_gpe->vx_flags = + MLX5_ENCAP_VXLAN_GPE_FLAGS; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + case RTE_FLOW_ITEM_TYPE_NVGRE: + gre = (struct rte_gre_hdr *)&buf[temp_size]; + if (!gre->proto) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "next protocol not found"); + if (!ipv4 && !ipv6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "ip header not found"); + if (ipv4 && !ipv4->next_proto_id) + ipv4->next_proto_id = IPPROTO_GRE; + else if (ipv6 && !ipv6->proto) + ipv6->proto = IPPROTO_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VOID: + break; + default: + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "unsupported item type"); + break; + } + temp_size += len; + } + *size = temp_size; + return 0; +} + +static int +flow_dv_zero_encap_udp_csum(void *data, struct rte_flow_error *error) +{ + struct rte_ether_hdr *eth = NULL; + struct rte_vlan_hdr *vlan = NULL; + struct rte_ipv6_hdr *ipv6 = NULL; + struct rte_udp_hdr *udp = NULL; + char *next_hdr; + uint16_t proto; + + eth = (struct rte_ether_hdr *)data; + next_hdr = (char *)(eth + 1); + proto = RTE_BE16(eth->ether_type); + + /* VLAN skipping */ + while (proto == RTE_ETHER_TYPE_VLAN || proto == RTE_ETHER_TYPE_QINQ) { + vlan = (struct rte_vlan_hdr *)next_hdr; + proto = RTE_BE16(vlan->eth_proto); + next_hdr += sizeof(struct rte_vlan_hdr); + } + + /* HW calculates IPv4 csum. no need to proceed */ + if (proto == RTE_ETHER_TYPE_IPV4) + return 0; + + /* non IPv4/IPv6 header. not supported */ + if (proto != RTE_ETHER_TYPE_IPV6) { + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "Cannot offload non IPv4/IPv6"); + } + + ipv6 = (struct rte_ipv6_hdr *)next_hdr; + + /* ignore non UDP */ + if (ipv6->proto != IPPROTO_UDP) + return 0; + + udp = (struct rte_udp_hdr *)(ipv6 + 1); + udp->dgram_cksum = 0; + + return 0; +} + +/** + * Convert L2 encap action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to action structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] transfer + * Mark if the flow is E-Switch flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
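+ *
+ * A minimal illustrative sketch (not part of the driver, addresses and
+ * VNI are arbitrary) of the item list a VXLAN_ENCAP action hands to this
+ * path; header fields left at zero are filled with defaults by
+ * flow_dv_convert_encap_data():
+ *
+ * @code
+ * struct rte_flow_item_eth eth = {
+ *     .dst.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
+ * };
+ * struct rte_flow_item_ipv4 ipv4 = {
+ *     .hdr.src_addr = RTE_BE32(RTE_IPV4(192, 168, 0, 1)),
+ *     .hdr.dst_addr = RTE_BE32(RTE_IPV4(192, 168, 0, 2)),
+ * };
+ * struct rte_flow_item_udp udp = { .hdr.dst_port = RTE_BE16(4789) };
+ * struct rte_flow_item_vxlan vxlan = { .vni = { 0x12, 0x34, 0x56 } };
+ * struct rte_flow_item definition[] = {
+ *     { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth },
+ *     { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
+ *     { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
+ *     { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
+ *     { .type = RTE_FLOW_ITEM_TYPE_END },
+ * };
+ * struct rte_flow_action_vxlan_encap conf = { .definition = definition };
+ * @endcode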
+ */ +static int +flow_dv_create_action_l2_encap(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct mlx5_flow *dev_flow, + uint8_t transfer, + struct rte_flow_error *error) +{ + const struct rte_flow_item *encap_data; + const struct rte_flow_action_raw_encap *raw_encap_data; + struct mlx5_flow_dv_encap_decap_resource res = { + .reformat_type = + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL, + .ft_type = transfer ? MLX5DV_FLOW_TABLE_TYPE_FDB : + MLX5DV_FLOW_TABLE_TYPE_NIC_TX, + }; + + if (action->type == RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + raw_encap_data = + (const struct rte_flow_action_raw_encap *)action->conf; + res.size = raw_encap_data->size; + memcpy(res.buf, raw_encap_data->data, res.size); + } else { + if (action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP) + encap_data = + ((const struct rte_flow_action_vxlan_encap *) + action->conf)->definition; + else + encap_data = + ((const struct rte_flow_action_nvgre_encap *) + action->conf)->definition; + if (flow_dv_convert_encap_data(encap_data, res.buf, + &res.size, error)) + return -rte_errno; + } + if (flow_dv_zero_encap_udp_csum(res.buf, error)) + return -rte_errno; + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create L2 encap action"); + return 0; +} + +/** + * Convert L2 decap action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] transfer + * Mark if the flow is E-Switch flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_l2_decap(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + uint8_t transfer, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv_encap_decap_resource res = { + .size = 0, + .reformat_type = + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2, + .ft_type = transfer ? MLX5DV_FLOW_TABLE_TYPE_FDB : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX, + }; + + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create L2 decap action"); + return 0; +} + +/** + * Convert raw decap/encap (L3 tunnel) action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to action structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_raw_encap(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_raw_encap *encap_data; + struct mlx5_flow_dv_encap_decap_resource res; + + memset(&res, 0, sizeof(res)); + encap_data = (const struct rte_flow_action_raw_encap *)action->conf; + res.size = encap_data->size; + memcpy(res.buf, encap_data->data, res.size); + res.reformat_type = res.size < MLX5_ENCAPSULATION_DECISION_SIZE ? 
+ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 : + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + if (attr->transfer) + res.ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; + else + res.ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create encap action"); + return 0; +} + +/** + * Create action push VLAN. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] vlan + * Pointer to the vlan to push to the Ethernet header. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_push_vlan(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_vlan_hdr *vlan, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv_push_vlan_action_resource res; + + memset(&res, 0, sizeof(res)); + res.vlan_tag = + rte_cpu_to_be_32(((uint32_t)vlan->eth_proto) << 16 | + vlan->vlan_tci); + if (attr->transfer) + res.ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; + else + res.ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + return flow_dv_push_vlan_action_resource_register + (dev, &res, dev_flow, error); +} + +/** + * Validate the modify-header actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_hdr(const uint64_t action_flags, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + if (action->type != RTE_FLOW_ACTION_TYPE_DEC_TTL && !action->conf) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "action configuration not set"); + if (action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have encap action before" + " modify action"); + return 0; +} + +/** + * Validate the modify-header MAC address actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_mac(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + if (!(item_flags & MLX5_FLOW_LAYER_L2)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no L2 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header IPv4 address actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. 
+ * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv4(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv4 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header IPv6 address actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv6(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv6 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header TP actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_tp(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no transport layer " + "in pattern"); + } + return ret; +} + +/** + * Validate the modify-header actions of increment/decrement + * TCP Sequence-number. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_tcp_seq(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? 
+ MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no TCP item in" + " pattern"); + if ((action->type == RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ && + (action_flags & MLX5_FLOW_ACTION_DEC_TCP_SEQ)) || + (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ && + (action_flags & MLX5_FLOW_ACTION_INC_TCP_SEQ))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot decrease and increase" + " TCP sequence number" + " at the same time"); + } + return ret; +} + +/** + * Validate the modify-header actions of increment/decrement + * TCP Acknowledgment number. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_tcp_ack(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no TCP item in" + " pattern"); + if ((action->type == RTE_FLOW_ACTION_TYPE_INC_TCP_ACK && + (action_flags & MLX5_FLOW_ACTION_DEC_TCP_ACK)) || + (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK && + (action_flags & MLX5_FLOW_ACTION_INC_TCP_ACK))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot decrease and increase" + " TCP acknowledgment number" + " at the same time"); + } + return ret; +} + +/** + * Validate the modify-header TTL actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ttl(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no IP protocol in pattern"); + } + return ret; +} + +/** + * Validate jump action. + * + * @param[in] action + * Pointer to the jump action. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attributes + * Pointer to flow attributes + * @param[in] external + * Action belongs to flow rule created by request external to PMD. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
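+ *
+ * A minimal illustrative sketch (not part of the driver, group numbers
+ * are arbitrary): the jump target group has to differ from the group the
+ * rule itself is created in:
+ *
+ * @code
+ * struct rte_flow_attr attr = { .group = 0, .ingress = 1 };
+ * struct rte_flow_action_jump jump = { .group = 1 };
+ * struct rte_flow_action action = {
+ *     .type = RTE_FLOW_ACTION_TYPE_JUMP,
+ *     .conf = &jump,
+ * };
+ * @endcode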
+ */ +static int +flow_dv_validate_action_jump(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attributes, + bool external, struct rte_flow_error *error) +{ + uint32_t target_group, table; + int ret = 0; + + if (action_flags & (MLX5_FLOW_FATE_ACTIONS | + MLX5_FLOW_FATE_ESWITCH_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions in" + " same flow"); + if (action_flags & MLX5_FLOW_ACTION_METER) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "jump with meter not support"); + if (!action->conf) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "action configuration not set"); + target_group = + ((const struct rte_flow_action_jump *)action->conf)->group; + ret = mlx5_flow_group_to_table(attributes, external, target_group, + true, &table, error); + if (ret) + return ret; + if (attributes->group == target_group) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "target group must be other than" + " the current flow group"); + return 0; +} + +/* + * Validate the port_id action. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] action + * Port_id RTE action structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_port_id(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_port_id *port_id; + struct mlx5_priv *act_priv; + struct mlx5_priv *dev_priv; + uint16_t port; + + if (!attr->transfer) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "port id action is valid in transfer" + " mode only"); + if (!action || !action->conf) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, + "port id action parameters must be" + " specified"); + if (action_flags & (MLX5_FLOW_FATE_ACTIONS | + MLX5_FLOW_FATE_ESWITCH_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can have only one fate actions in" + " a flow"); + dev_priv = mlx5_dev_to_eswitch_info(dev); + if (!dev_priv) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "failed to obtain E-Switch info"); + port_id = action->conf; + port = port_id->original ? dev->data->port_id : port_id->id; + act_priv = mlx5_port_to_eswitch_info(port, false); + if (!act_priv) + return rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, port_id, + "failed to obtain E-Switch port id for port"); + if (act_priv->domain_id != dev_priv->domain_id) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "port does not belong to" + " E-Switch being configured"); + return 0; +} + +/** + * Get the maximum number of modify header actions. + * + * @param dev + * Pointer to rte_eth_dev structure. + * @param flags + * Flags bits to check if root level. + * + * @return + * Max number of modify header actions device can support. 
+ */ +static inline unsigned int +flow_dv_modify_hdr_action_max(struct rte_eth_dev *dev __rte_unused, + uint64_t flags) +{ + /* + * There's no way to directly query the max capacity from FW. + * The maximal value on root table should be assumed to be supported. + */ + if (!(flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL)) + return MLX5_MAX_MODIFY_NUM; + else + return MLX5_ROOT_TBL_MODIFY_NUM; +} + +/** + * Validate the meter action. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] action + * Pointer to the meter action. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +mlx5_flow_validate_action_meter(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_meter *am = action->conf; + struct mlx5_flow_meter *fm; + + if (!am) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "meter action conf is NULL"); + + if (action_flags & MLX5_FLOW_ACTION_METER) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "meter chaining not support"); + if (action_flags & MLX5_FLOW_ACTION_JUMP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "meter with jump not support"); + if (!priv->mtr_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "meter action not supported"); + fm = mlx5_flow_meter_find(priv, am->mtr_id); + if (!fm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Meter not found"); + if (fm->ref_cnt && (!(fm->transfer == attr->transfer || + (!fm->ingress && !attr->ingress && attr->egress) || + (!fm->egress && !attr->egress && attr->ingress)))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Flow attributes are either invalid " + "or have a conflict with current " + "meter attributes"); + return 0; +} + +/** + * Validate the age action. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the age action. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_validate_action_age(uint64_t action_flags, + const struct rte_flow_action *action, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_age *age = action->conf; + + if (!priv->config.devx || priv->counter_fallback) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "age action not supported"); + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (age->timeout >= UINT16_MAX / 2 / 10) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Max age time: 3275 seconds"); + if (action_flags & MLX5_FLOW_ACTION_AGE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Duplicate age ctions set"); + return 0; +} + +/** + * Validate the modify-header IPv4 DSCP actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv4_dscp(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV4)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv4 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header IPv6 DSCP actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv6_dscp(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV6)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv6 item in pattern"); + } + return ret; +} + +/** + * Find existing modify-header resource or create and register a new one. + * + * @param dev[in, out] + * Pointer to rte_eth_dev structure. + * @param[in, out] resource + * Pointer to modify-header resource. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_modify_hdr_resource_register + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *resource, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_modify_hdr_resource *cache_resource; + struct mlx5dv_dr_domain *ns; + uint32_t actions_len; + + resource->flags = dev_flow->dv.group ? 
0 : + MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL; + if (resource->actions_num > flow_dv_modify_hdr_action_max(dev, + resource->flags)) + return rte_flow_error_set(error, EOVERFLOW, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many modify header items"); + if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB) + ns = sh->fdb_domain; + else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_TX) + ns = sh->tx_domain; + else + ns = sh->rx_domain; + /* Lookup a matching resource from cache. */ + actions_len = resource->actions_num * sizeof(resource->actions[0]); + LIST_FOREACH(cache_resource, &sh->modify_cmds, next) { + if (resource->ft_type == cache_resource->ft_type && + resource->actions_num == cache_resource->actions_num && + resource->flags == cache_resource->flags && + !memcmp((const void *)resource->actions, + (const void *)cache_resource->actions, + actions_len)) { + DRV_LOG(DEBUG, "modify-header resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.modify_hdr = cache_resource; + return 0; + } + } + /* Register new modify-header resource. */ + cache_resource = rte_calloc(__func__, 1, + sizeof(*cache_resource) + actions_len, 0); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; + rte_memcpy(cache_resource->actions, resource->actions, actions_len); + cache_resource->verbs_action = + mlx5_glue->dv_create_flow_action_modify_header + (sh->ctx, cache_resource->ft_type, ns, + cache_resource->flags, actions_len, + (uint64_t *)cache_resource->actions); + if (!cache_resource->verbs_action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + LIST_INSERT_HEAD(&sh->modify_cmds, cache_resource, next); + dev_flow->handle->dvh.modify_hdr = cache_resource; + DRV_LOG(DEBUG, "new modify-header resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} + +/** + * Get DV flow counter by index. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] idx + * mlx5 flow counter index in the container. + * @param[out] ppool + * mlx5 flow counter pool in the container, + * + * @return + * Pointer to the counter, NULL otherwise. + */ +static struct mlx5_flow_counter * +flow_dv_counter_get_by_idx(struct rte_eth_dev *dev, + uint32_t idx, + struct mlx5_flow_counter_pool **ppool) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont; + struct mlx5_flow_counter_pool *pool; + uint32_t batch = 0, age = 0; + + idx--; + age = MLX_CNT_IS_AGE(idx); + idx = age ? idx - MLX5_CNT_AGE_OFFSET : idx; + if (idx >= MLX5_CNT_BATCH_OFFSET) { + idx -= MLX5_CNT_BATCH_OFFSET; + batch = 1; + } + cont = MLX5_CNT_CONTAINER(priv->sh, batch, age); + MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cont->n); + pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL]; + MLX5_ASSERT(pool); + if (ppool) + *ppool = pool; + return MLX5_POOL_GET_CNT(pool, idx % MLX5_COUNTERS_PER_POOL); +} + +/** + * Get a pool by devx counter ID. + * + * @param[in] cont + * Pointer to the counter container. + * @param[in] id + * The counter devx ID. 
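/*
 * Editor's note (illustrative sketch, not driver code): the modify-header
 * resource registration above reuses a cached resource when the table type,
 * action count, flags and the raw action bytes all match, and only bumps a
 * reference count on a hit. The stand-alone C below shows that
 * lookup-or-insert pattern with <sys/queue.h>; names such as "res_cache"
 * and "res_cache_register" are invented for illustration.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

struct res_cache {
	LIST_ENTRY(res_cache) next;
	uint32_t refcnt;
	size_t len;
	uint8_t blob[];          /* opaque payload, compared bytewise */
};

static LIST_HEAD(, res_cache) res_list = LIST_HEAD_INITIALIZER(res_list);

/* Return a matching entry with an extra reference, or insert a new one. */
static struct res_cache *
res_cache_register(const uint8_t *blob, size_t len)
{
	struct res_cache *rc;

	LIST_FOREACH(rc, &res_list, next) {
		if (rc->len == len && !memcmp(rc->blob, blob, len)) {
			rc->refcnt++;      /* cache hit: share the entry */
			return rc;
		}
	}
	rc = calloc(1, sizeof(*rc) + len);
	if (rc == NULL)
		return NULL;
	rc->refcnt = 1;
	rc->len = len;
	memcpy(rc->blob, blob, len);
	LIST_INSERT_HEAD(&res_list, rc, next);
	return rc;
}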
+ * + * @return + * The counter pool pointer if exists, NULL otherwise, + */ +static struct mlx5_flow_counter_pool * +flow_dv_find_pool_by_id(struct mlx5_pools_container *cont, int id) +{ + uint32_t i; + uint32_t n_valid = rte_atomic16_read(&cont->n_valid); + + for (i = 0; i < n_valid; i++) { + struct mlx5_flow_counter_pool *pool = cont->pools[i]; + int base = (pool->min_dcs->id / MLX5_COUNTERS_PER_POOL) * + MLX5_COUNTERS_PER_POOL; + + if (id >= base && id < base + MLX5_COUNTERS_PER_POOL) { + /* + * Move the pool to the head, as counter allocate + * always gets the first pool in the container. + */ + if (pool != TAILQ_FIRST(&cont->pool_list)) { + TAILQ_REMOVE(&cont->pool_list, pool, next); + TAILQ_INSERT_HEAD(&cont->pool_list, pool, next); + } + return pool; + } + } + return NULL; +} + +/** + * Allocate a new memory for the counter values wrapped by all the needed + * management. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] raws_n + * The raw memory areas - each one for MLX5_COUNTERS_PER_POOL counters. + * + * @return + * The new memory management pointer on success, otherwise NULL and rte_errno + * is set. + */ +static struct mlx5_counter_stats_mem_mng * +flow_dv_create_counter_stat_mem_mng(struct rte_eth_dev *dev, int raws_n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_devx_mkey_attr mkey_attr; + struct mlx5_counter_stats_mem_mng *mem_mng; + volatile struct flow_counter_stats *raw_data; + int size = (sizeof(struct flow_counter_stats) * + MLX5_COUNTERS_PER_POOL + + sizeof(struct mlx5_counter_stats_raw)) * raws_n + + sizeof(struct mlx5_counter_stats_mem_mng); + uint8_t *mem = rte_calloc(__func__, 1, size, sysconf(_SC_PAGESIZE)); + int i; + + if (!mem) { + rte_errno = ENOMEM; + return NULL; + } + mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1; + size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n; + mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size, + IBV_ACCESS_LOCAL_WRITE); + if (!mem_mng->umem) { + rte_errno = errno; + rte_free(mem); + return NULL; + } + mkey_attr.addr = (uintptr_t)mem; + mkey_attr.size = size; + mkey_attr.umem_id = mem_mng->umem->umem_id; + mkey_attr.pd = sh->pdn; + mkey_attr.log_entity_size = 0; + mkey_attr.pg_access = 0; + mkey_attr.klm_array = NULL; + mkey_attr.klm_num = 0; + if (priv->config.hca_attr.relaxed_ordering_write && + priv->config.hca_attr.relaxed_ordering_read && + !haswell_broadwell_cpu) + mkey_attr.relaxed_ordering = 1; + mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr); + if (!mem_mng->dm) { + mlx5_glue->devx_umem_dereg(mem_mng->umem); + rte_errno = errno; + rte_free(mem); + return NULL; + } + mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size); + raw_data = (volatile struct flow_counter_stats *)mem; + for (i = 0; i < raws_n; ++i) { + mem_mng->raws[i].mem_mng = mem_mng; + mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL; + } + LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next); + return mem_mng; +} + +/** + * Resize a counter container. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the pool is for Aging counter. + * + * @return + * 0 on success, otherwise negative errno value and rte_errno is set. 
+ */ +static int +flow_dv_container_resize(struct rte_eth_dev *dev, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch, + age); + struct mlx5_counter_stats_mem_mng *mem_mng = NULL; + void *old_pools = cont->pools; + uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE; + uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize; + void *pools = rte_calloc(__func__, 1, mem_size, 0); + + if (!pools) { + rte_errno = ENOMEM; + return -ENOMEM; + } + if (old_pools) + memcpy(pools, old_pools, cont->n * + sizeof(struct mlx5_flow_counter_pool *)); + /* + * Fallback mode query the counter directly, no background query + * resources are needed. + */ + if (!priv->counter_fallback) { + int i; + + mem_mng = flow_dv_create_counter_stat_mem_mng(dev, + MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES); + if (!mem_mng) { + rte_free(pools); + return -ENOMEM; + } + for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i) + LIST_INSERT_HEAD(&priv->sh->cmng.free_stat_raws, + mem_mng->raws + + MLX5_CNT_CONTAINER_RESIZE + + i, next); + } + rte_spinlock_lock(&cont->resize_sl); + cont->n = resize; + cont->mem_mng = mem_mng; + cont->pools = pools; + rte_spinlock_unlock(&cont->resize_sl); + if (old_pools) + rte_free(old_pools); + return 0; +} + +/** + * Query a devx flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] cnt + * Index to the flow counter. + * @param[out] pkts + * The statistics value of packets. + * @param[out] bytes + * The statistics value of bytes. + * + * @return + * 0 on success, otherwise a negative errno value and rte_errno is set. + */ +static inline int +_flow_dv_query_count(struct rte_eth_dev *dev, uint32_t counter, uint64_t *pkts, + uint64_t *bytes) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + int offset; + + cnt = flow_dv_counter_get_by_idx(dev, counter, &pool); + MLX5_ASSERT(pool); + if (counter < MLX5_CNT_BATCH_OFFSET) { + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + if (priv->counter_fallback) + return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0, + 0, pkts, bytes, 0, NULL, NULL, 0); + } + + rte_spinlock_lock(&pool->sl); + /* + * The single counters allocation may allocate smaller ID than the + * current allocated in parallel to the host reading. + * In this case the new counter values must be reported as 0. + */ + if (unlikely(cnt_ext && cnt_ext->dcs->id < pool->raw->min_dcs_id)) { + *pkts = 0; + *bytes = 0; + } else { + offset = MLX5_CNT_ARRAY_IDX(pool, cnt); + *pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits); + *bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes); + } + rte_spinlock_unlock(&pool->sl); + return 0; +} + +/** + * Create and initialize a new counter pool. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] dcs + * The devX counter handle. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the pool is for counter that was allocated for aging. + * @param[in/out] cont_cur + * Pointer to the container pointer, it will be update in pool resize. + * + * @return + * The pool container pointer on success, NULL otherwise and rte_errno is set. 
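/*
 * Editor's note (illustrative sketch, not driver code): flow_dv_container_resize()
 * above grows the array of pool pointers by a fixed step, copies the existing
 * pointers into the larger array and publishes it before freeing the old one
 * (in the driver the swap happens under a spinlock so concurrent readers see
 * a consistent pair). The minimal, single-threaded version of that
 * grow-and-swap pattern below uses a hypothetical "pool_container" type.
 */
#include <stdlib.h>
#include <string.h>

#define RESIZE_STEP 8

struct pool_container {
	unsigned int n;      /* current capacity, in pool slots */
	void **pools;        /* array of pool pointers */
};

static int
container_resize(struct pool_container *cont)
{
	unsigned int new_n = cont->n + RESIZE_STEP;
	void **pools = calloc(new_n, sizeof(*pools));

	if (pools == NULL)
		return -1;
	if (cont->pools != NULL)
		memcpy(pools, cont->pools, cont->n * sizeof(*pools));
	/* Publish the larger array, then release the old one. */
	free(cont->pools);
	cont->pools = pools;
	cont->n = new_n;
	return 0;
}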
+ */ +static struct mlx5_flow_counter_pool * +flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *pool; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch, + age); + int16_t n_valid = rte_atomic16_read(&cont->n_valid); + uint32_t size = sizeof(*pool); + + if (cont->n == n_valid && flow_dv_container_resize(dev, batch, age)) + return NULL; + size += MLX5_COUNTERS_PER_POOL * CNT_SIZE; + size += (batch ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE); + size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE); + pool = rte_calloc(__func__, 1, size, 0); + if (!pool) { + rte_errno = ENOMEM; + return NULL; + } + pool->min_dcs = dcs; + if (!priv->counter_fallback) + pool->raw = cont->mem_mng->raws + n_valid % + MLX5_CNT_CONTAINER_RESIZE; + pool->raw_hw = NULL; + pool->type = 0; + pool->type |= (batch ? 0 : CNT_POOL_TYPE_EXT); + pool->type |= (!age ? 0 : CNT_POOL_TYPE_AGE); + rte_spinlock_init(&pool->sl); + /* + * The generation of the new allocated counters in this pool is 0, 2 in + * the pool generation makes all the counters valid for allocation. + * The start and end query generation protect the counters be released + * between the query and update gap period will not be reallocated + * without the last query finished and stats updated to the memory. + */ + rte_atomic64_set(&pool->start_query_gen, 0x2); + /* + * There's no background query thread for fallback mode, set the + * end_query_gen to the maximum value since no need to wait for + * statistics update. + */ + rte_atomic64_set(&pool->end_query_gen, priv->counter_fallback ? + INT64_MAX : 0x2); + TAILQ_INIT(&pool->counters); + TAILQ_INSERT_HEAD(&cont->pool_list, pool, next); + pool->index = n_valid; + cont->pools[n_valid] = pool; + /* Pool initialization must be updated before host thread access. */ + rte_cio_wmb(); + rte_atomic16_add(&cont->n_valid, 1); + return pool; +} + +/** + * Update the minimum dcs-id for aged or no-aged counter pool. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] pool + * Current counter pool. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the counter is for aging. + */ +static void +flow_dv_counter_update_min_dcs(struct rte_eth_dev *dev, + struct mlx5_flow_counter_pool *pool, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *other; + struct mlx5_pools_container *cont; + + cont = MLX5_CNT_CONTAINER(priv->sh, batch, (age ^ 0x1)); + other = flow_dv_find_pool_by_id(cont, pool->min_dcs->id); + if (!other) + return; + if (pool->min_dcs->id < other->min_dcs->id) { + rte_atomic64_set(&other->a64_dcs, + rte_atomic64_read(&pool->a64_dcs)); + } else { + rte_atomic64_set(&pool->a64_dcs, + rte_atomic64_read(&other->a64_dcs)); + } +} +/** + * Prepare a new counter and/or a new counter pool. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] cnt_free + * Where to put the pointer of a new counter. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the pool is for counter that was allocated for aging. + * + * @return + * The counter pool pointer and @p cnt_free is set on success, + * NULL otherwise and rte_errno is set. 
+ */ +static struct mlx5_flow_counter_pool * +flow_dv_counter_pool_prepare(struct rte_eth_dev *dev, + struct mlx5_flow_counter **cnt_free, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont; + struct mlx5_flow_counter_pool *pool; + struct mlx5_devx_obj *dcs = NULL; + struct mlx5_flow_counter *cnt; + uint32_t i; + + cont = MLX5_CNT_CONTAINER(priv->sh, batch, age); + if (!batch) { + /* bulk_bitmap must be 0 for single counter allocation. */ + dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0); + if (!dcs) + return NULL; + pool = flow_dv_find_pool_by_id(cont, dcs->id); + if (!pool) { + pool = flow_dv_pool_create(dev, dcs, batch, age); + if (!pool) { + mlx5_devx_cmd_destroy(dcs); + return NULL; + } + } else if (dcs->id < pool->min_dcs->id) { + rte_atomic64_set(&pool->a64_dcs, + (int64_t)(uintptr_t)dcs); + } + flow_dv_counter_update_min_dcs(dev, + pool, batch, age); + i = dcs->id % MLX5_COUNTERS_PER_POOL; + cnt = MLX5_POOL_GET_CNT(pool, i); + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + MLX5_GET_POOL_CNT_EXT(pool, i)->dcs = dcs; + *cnt_free = cnt; + return pool; + } + /* bulk_bitmap is in 128 counters units. */ + if (priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) + dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0x4); + if (!dcs) { + rte_errno = ENODATA; + return NULL; + } + pool = flow_dv_pool_create(dev, dcs, batch, age); + if (!pool) { + mlx5_devx_cmd_destroy(dcs); + return NULL; + } + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt = MLX5_POOL_GET_CNT(pool, i); + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + } + *cnt_free = MLX5_POOL_GET_CNT(pool, 0); + return pool; +} + +/** + * Search for existed shared counter. + * + * @param[in] cont + * Pointer to the relevant counter pool container. + * @param[in] id + * The shared counter ID to search. + * @param[out] ppool + * mlx5 flow counter pool in the container, + * + * @return + * NULL if not existed, otherwise pointer to the shared extend counter. + */ +static struct mlx5_flow_counter_ext * +flow_dv_counter_shared_search(struct mlx5_pools_container *cont, uint32_t id, + struct mlx5_flow_counter_pool **ppool) +{ + struct mlx5_flow_counter_ext *cnt; + struct mlx5_flow_counter_pool *pool; + uint32_t i, j; + uint32_t n_valid = rte_atomic16_read(&cont->n_valid); + + for (i = 0; i < n_valid; i++) { + pool = cont->pools[i]; + for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) { + cnt = MLX5_GET_POOL_CNT_EXT(pool, j); + if (cnt->ref_cnt && cnt->shared && cnt->id == id) { + if (ppool) + *ppool = cont->pools[i]; + return cnt; + } + } + } + return NULL; +} + +/** + * Allocate a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] shared + * Indicate if this counter is shared with other flows. + * @param[in] id + * Counter identifier. + * @param[in] group + * Counter flow group. + * @param[in] age + * Whether the counter was allocated for aging. + * + * @return + * Index to flow counter on success, 0 otherwise and rte_errno is set. 
+ */ +static uint32_t +flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id, + uint16_t group, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter *cnt_free = NULL; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + /* + * Currently group 0 flow counter cannot be assigned to a flow if it is + * not the first one in the batch counter allocation, so it is better + * to allocate counters one by one for these flows in a separate + * container. + * A counter can be shared between different groups so need to take + * shared counters from the single container. + */ + uint32_t batch = (group && !shared && !priv->counter_fallback) ? 1 : 0; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch, + age); + uint32_t cnt_idx; + + if (!priv->config.devx) { + rte_errno = ENOTSUP; + return 0; + } + if (shared) { + cnt_ext = flow_dv_counter_shared_search(cont, id, &pool); + if (cnt_ext) { + if (cnt_ext->ref_cnt + 1 == 0) { + rte_errno = E2BIG; + return 0; + } + cnt_ext->ref_cnt++; + cnt_idx = pool->index * MLX5_COUNTERS_PER_POOL + + (cnt_ext->dcs->id % MLX5_COUNTERS_PER_POOL) + + 1; + return cnt_idx; + } + } + /* Pools which has a free counters are in the start. */ + TAILQ_FOREACH(pool, &cont->pool_list, next) { + /* + * The free counter reset values must be updated between the + * counter release to the counter allocation, so, at least one + * query must be done in this time. ensure it by saving the + * query generation in the release time. + * The free list is sorted according to the generation - so if + * the first one is not updated, all the others are not + * updated too. + */ + cnt_free = TAILQ_FIRST(&pool->counters); + if (cnt_free && cnt_free->query_gen < + rte_atomic64_read(&pool->end_query_gen)) + break; + cnt_free = NULL; + } + if (!cnt_free) { + pool = flow_dv_counter_pool_prepare(dev, &cnt_free, batch, age); + if (!pool) + return 0; + } + if (!batch) + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt_free); + /* Create a DV counter action only in the first time usage. */ + if (!cnt_free->action) { + uint16_t offset; + struct mlx5_devx_obj *dcs; + + if (batch) { + offset = MLX5_CNT_ARRAY_IDX(pool, cnt_free); + dcs = pool->min_dcs; + } else { + offset = 0; + dcs = cnt_ext->dcs; + } + cnt_free->action = mlx5_glue->dv_create_flow_action_counter + (dcs->obj, offset); + if (!cnt_free->action) { + rte_errno = errno; + return 0; + } + } + cnt_idx = MLX5_MAKE_CNT_IDX(pool->index, + MLX5_CNT_ARRAY_IDX(pool, cnt_free)); + cnt_idx += batch * MLX5_CNT_BATCH_OFFSET; + cnt_idx += age * MLX5_CNT_AGE_OFFSET; + /* Update the counter reset values. */ + if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits, + &cnt_free->bytes)) + return 0; + if (cnt_ext) { + cnt_ext->shared = shared; + cnt_ext->ref_cnt = 1; + cnt_ext->id = id; + } + if (!priv->counter_fallback && !priv->sh->cmng.query_thread_on) + /* Start the asynchronous batch query by the host thread. */ + mlx5_set_query_alarm(priv->sh); + TAILQ_REMOVE(&pool->counters, cnt_free, next); + if (TAILQ_EMPTY(&pool->counters)) { + /* Move the pool to the end of the container pool list. */ + TAILQ_REMOVE(&cont->pool_list, pool, next); + TAILQ_INSERT_TAIL(&cont->pool_list, pool, next); + } + return cnt_idx; +} + +/** + * Get age param from counter index. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + * + * @return + * The aging parameter specified for the counter index. 
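/*
 * Editor's note (illustrative sketch, not driver code): the allocation path
 * above packs everything needed to locate a counter into one uint32_t index:
 * pool index, slot within the pool, a +1 bias so that 0 means "no counter",
 * and large fixed offsets that mark batch-allocated and aging counters. The
 * toy encode/decode pair below uses made-up SLOTS_PER_POOL/BATCH_OFFSET/
 * AGE_OFFSET values; the real MLX5_CNT_* constants differ.
 */
#include <stdint.h>

#define SLOTS_PER_POOL 512u          /* counters per pool (illustrative) */
#define BATCH_OFFSET   0x00800000u   /* marks batch-allocated counters */
#define AGE_OFFSET     0x80000000u   /* marks counters used for aging */

static uint32_t
cnt_index_encode(uint32_t pool, uint32_t slot, int batch, int age)
{
	uint32_t idx = pool * SLOTS_PER_POOL + slot + 1; /* 0 = invalid */

	if (batch)
		idx += BATCH_OFFSET;
	if (age)
		idx += AGE_OFFSET;
	return idx;
}

static void
cnt_index_decode(uint32_t idx, uint32_t *pool, uint32_t *slot,
		 int *batch, int *age)
{
	idx -= 1;
	*age = (idx & AGE_OFFSET) != 0;
	idx &= ~AGE_OFFSET;
	*batch = idx >= BATCH_OFFSET;
	if (*batch)
		idx -= BATCH_OFFSET;
	*pool = idx / SLOTS_PER_POOL;
	*slot = idx % SLOTS_PER_POOL;
}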
+ */ +static struct mlx5_age_param* +flow_dv_counter_idx_get_age(struct rte_eth_dev *dev, + uint32_t counter) +{ + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_pool *pool = NULL; + + flow_dv_counter_get_by_idx(dev, counter, &pool); + counter = (counter - 1) % MLX5_COUNTERS_PER_POOL; + cnt = MLX5_POOL_GET_CNT(pool, counter); + return MLX5_CNT_TO_AGE(cnt); +} + +/** + * Remove a flow counter from aged counter list. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + * @param[in] cnt + * Pointer to the counter handler. + */ +static void +flow_dv_counter_remove_from_age(struct rte_eth_dev *dev, + uint32_t counter, struct mlx5_flow_counter *cnt) +{ + struct mlx5_age_info *age_info; + struct mlx5_age_param *age_param; + struct mlx5_priv *priv = dev->data->dev_private; + + age_info = GET_PORT_AGE_INFO(priv); + age_param = flow_dv_counter_idx_get_age(dev, counter); + if (rte_atomic16_cmpset((volatile uint16_t *) + &age_param->state, + AGE_CANDIDATE, AGE_FREE) + != AGE_CANDIDATE) { + /** + * We need the lock even it is age timeout, + * since counter may still in process. + */ + rte_spinlock_lock(&age_info->aged_sl); + TAILQ_REMOVE(&age_info->aged_counters, cnt, next); + rte_spinlock_unlock(&age_info->aged_sl); + } + rte_atomic16_set(&age_param->state, AGE_FREE); +} +/** + * Release a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + */ +static void +flow_dv_counter_release(struct rte_eth_dev *dev, uint32_t counter) +{ + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + + if (!counter) + return; + cnt = flow_dv_counter_get_by_idx(dev, counter, &pool); + MLX5_ASSERT(pool); + if (counter < MLX5_CNT_BATCH_OFFSET) { + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + if (cnt_ext && --cnt_ext->ref_cnt) + return; + } + if (IS_AGE_POOL(pool)) + flow_dv_counter_remove_from_age(dev, counter, cnt); + /* Put the counter in the end - the last updated one. */ + TAILQ_INSERT_TAIL(&pool->counters, cnt, next); + /* + * Counters released between query trigger and handler need + * to wait the next round of query. Since the packets arrive + * in the gap period will not be taken into account to the + * old counter. + */ + cnt->query_gen = rte_atomic64_read(&pool->start_query_gen); +} + +/** + * Verify the @p attributes will be correctly understood by the NIC and store + * them in the @p flow if everything is correct. + * + * @param[in] dev + * Pointer to dev struct. + * @param[in] attributes + * Pointer to flow attributes + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[out] error + * Pointer to error structure. + * + * @return + * - 0 on success and non root table. + * - 1 on success and root table. + * - a negative errno value otherwise and rte_errno is set. 
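/*
 * Editor's note (illustrative sketch, not driver code): the age handling
 * above drives a small free/candidate/aged state machine with compare-and-
 * swap so that the release path and the background aging scan never both act
 * on the same counter. A minimal C11 analogue of that hand-off, with
 * invented state names ("aged_entry", "age_release"):
 */
#include <stdatomic.h>
#include <stdbool.h>

enum age_state { AGE_FREE_S, AGE_CANDIDATE_S, AGE_AGED_S };

struct aged_entry {
	_Atomic int state;
};

/*
 * Returns true when the caller must also unlink the entry from the aged
 * list (the CAS lost because the aging scan already moved it to AGED).
 */
static bool
age_release(struct aged_entry *e)
{
	int expected = AGE_CANDIDATE_S;

	if (!atomic_compare_exchange_strong(&e->state, &expected,
					    AGE_FREE_S)) {
		/* Entry already aged out: remove it from the aged list
		 * (under the list lock in the driver), then mark it free. */
		atomic_store(&e->state, AGE_FREE_S);
		return true;
	}
	return false;
}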
+ */ +static int +flow_dv_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + bool external __rte_unused, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t priority_max = priv->config.flow_prio - 1; + int ret = 0; + +#ifndef HAVE_MLX5DV_DR + if (attributes->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, + "groups are not supported"); +#else + uint32_t table = 0; + + ret = mlx5_flow_group_to_table(attributes, external, + attributes->group, !!priv->fdb_def_rule, + &table, error); + if (ret) + return ret; + if (!table) + ret = MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL; +#endif + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, + "priority out of range"); + if (attributes->transfer) { + if (!priv->config.dv_esw_en) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "E-Switch dr is not supported"); + if (!(priv->representor || priv->master)) + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "E-Switch configuration can only be" + " done by a master or a representor device"); + if (attributes->egress) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, attributes, + "egress is not supported"); + } + if (!(attributes->egress ^ attributes->ingress)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR, NULL, + "must specify exactly one of " + "ingress or egress"); + return ret; +} + +/** + * Internal validation function. For validating both actions and items. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] hairpin + * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, int hairpin, struct rte_flow_error *error) +{ + int ret; + uint64_t action_flags = 0; + uint64_t item_flags = 0; + uint64_t last_item = 0; + uint8_t next_protocol = 0xff; + uint16_t ether_type = 0; + int actions_n = 0; + uint8_t item_ipv6_proto = 0; + const struct rte_flow_item *gre_item = NULL; + const struct rte_flow_action_raw_decap *decap; + const struct rte_flow_action_raw_encap *encap; + const struct rte_flow_action_rss *rss; + const struct rte_flow_item_tcp nic_tcp_mask = { + .hdr = { + .tcp_flags = 0xFF, + .src_port = RTE_BE16(UINT16_MAX), + .dst_port = RTE_BE16(UINT16_MAX), + } + }; + const struct rte_flow_item_ipv4 nic_ipv4_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + .time_to_live = 0xff, + }, + }; + const struct rte_flow_item_ipv6 nic_ipv6_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + .hop_limits = 0xff, + }, + }; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *dev_conf = &priv->config; + uint16_t queue_index = 0xFFFF; + const struct rte_flow_item_vlan *vlan_m = NULL; + int16_t rw_act_num = 0; + uint64_t is_root; + + if (items == NULL) + return -1; + ret = flow_dv_validate_attributes(dev, attr, external, error); + if (ret < 0) + return ret; + is_root = (uint64_t)ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int type = items->type; + + switch (type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_PORT_ID: + ret = flow_dv_validate_item_port_id + (dev, items, attr, item_flags, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_PORT_ID; + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_eth *) + items->spec)->type; + ether_type &= + ((const struct rte_flow_item_eth *) + items->mask)->type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = flow_dv_validate_item_vlan(items, item_flags, + dev, error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_vlan *) + items->spec)->inner_type; + ether_type &= + ((const struct rte_flow_item_vlan *) + items->mask)->inner_type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + /* Store outer VLAN mask for of_push_vlan action. */ + if (!tunnel) + vlan_m = items->mask; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + last_item, + ether_type, + &nic_ipv4_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? 
MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) { + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + next_protocol &= + ((const struct rte_flow_item_ipv4 *) + (items->mask))->hdr.next_proto_id; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + last_item, + ether_type, + &nic_ipv6_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) { + item_ipv6_proto = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol &= + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_validate_item_tcp + (items, item_flags, + next_protocol, + &nic_tcp_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + ret = mlx5_flow_validate_item_gre(items, item_flags, + next_protocol, error); + if (ret < 0) + return ret; + gre_item = items; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + ret = mlx5_flow_validate_item_nvgre(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_NVGRE; + break; + case RTE_FLOW_ITEM_TYPE_GRE_KEY: + ret = mlx5_flow_validate_item_gre_key + (items, item_flags, gre_item, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GRE_KEY; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_validate_item_vxlan(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + ret = mlx5_flow_validate_item_vxlan_gpe(items, + item_flags, dev, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GENEVE: + ret = mlx5_flow_validate_item_geneve(items, + item_flags, dev, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GENEVE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + ret = mlx5_flow_validate_item_mpls(dev, items, + item_flags, + last_item, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_MPLS; + break; + + case RTE_FLOW_ITEM_TYPE_MARK: + ret = flow_dv_validate_item_mark(dev, items, attr, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_MARK; + break; + case RTE_FLOW_ITEM_TYPE_META: + ret = flow_dv_validate_item_meta(dev, items, attr, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_METADATA; + break; + case RTE_FLOW_ITEM_TYPE_ICMP: + ret = mlx5_flow_validate_item_icmp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_ICMP; + 
break; + case RTE_FLOW_ITEM_TYPE_ICMP6: + ret = mlx5_flow_validate_item_icmp6(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + item_ipv6_proto = IPPROTO_ICMPV6; + last_item = MLX5_FLOW_LAYER_ICMP6; + break; + case RTE_FLOW_ITEM_TYPE_TAG: + ret = flow_dv_validate_item_tag(dev, items, + attr, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_TAG; + break; + case MLX5_RTE_FLOW_ITEM_TYPE_TAG: + case MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE: + break; + case RTE_FLOW_ITEM_TYPE_GTP: + ret = flow_dv_validate_item_gtp(dev, items, item_flags, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GTP; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + item_flags |= last_item; + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + int type = actions->type; + if (actions_n == MLX5_DV_MAX_NUMBER_OF_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, "too many actions"); + switch (type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + ret = flow_dv_validate_action_port_id(dev, + action_flags, + actions, + attr, + error); + if (ret) + return ret; + action_flags |= MLX5_FLOW_ACTION_PORT_ID; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + ret = flow_dv_validate_action_flag(dev, action_flags, + attr, error); + if (ret < 0) + return ret; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + /* Count all modify-header actions as one. */ + if (!(action_flags & + MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_FLAG | + MLX5_FLOW_ACTION_MARK_EXT; + } else { + action_flags |= MLX5_FLOW_ACTION_FLAG; + ++actions_n; + } + rw_act_num += MLX5_ACT_NUM_SET_MARK; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = flow_dv_validate_action_mark(dev, actions, + action_flags, + attr, error); + if (ret < 0) + return ret; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + /* Count all modify-header actions as one. */ + if (!(action_flags & + MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_MARK | + MLX5_FLOW_ACTION_MARK_EXT; + } else { + action_flags |= MLX5_FLOW_ACTION_MARK; + ++actions_n; + } + rw_act_num += MLX5_ACT_NUM_SET_MARK; + break; + case RTE_FLOW_ACTION_TYPE_SET_META: + ret = flow_dv_validate_action_set_meta(dev, actions, + action_flags, + attr, error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_META; + rw_act_num += MLX5_ACT_NUM_SET_META; + break; + case RTE_FLOW_ACTION_TYPE_SET_TAG: + ret = flow_dv_validate_action_set_tag(dev, actions, + action_flags, + attr, error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. 
*/ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_validate_action_drop(action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DROP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + ret = mlx5_flow_validate_action_queue(actions, + action_flags, dev, + attr, error); + if (ret < 0) + return ret; + queue_index = ((const struct rte_flow_action_queue *) + (actions->conf))->index; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = actions->conf; + ret = mlx5_flow_validate_action_rss(actions, + action_flags, dev, + attr, item_flags, + error); + if (ret < 0) + return ret; + if (rss != NULL && rss->queue_num) + queue_index = rss->queue[0]; + action_flags |= MLX5_FLOW_ACTION_RSS; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_dv_validate_action_count(dev, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + if (flow_dv_validate_action_pop_vlan(dev, + action_flags, + actions, + item_flags, attr, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + ret = flow_dv_validate_action_push_vlan(dev, + action_flags, + vlan_m, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + ret = flow_dv_validate_action_set_vlan_pcp + (action_flags, actions, error); + if (ret < 0) + return ret; + /* Count PCP with push_vlan command. */ + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + ret = flow_dv_validate_action_set_vlan_vid + (item_flags, action_flags, + actions, error); + if (ret < 0) + return ret; + /* Count VID with push_vlan command. */ + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + rw_act_num += MLX5_ACT_NUM_MDF_VID; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + ret = flow_dv_validate_action_l2_encap(dev, + action_flags, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_ENCAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + ret = flow_dv_validate_action_decap(dev, action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DECAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + ret = flow_dv_validate_action_raw_encap_decap + (dev, NULL, actions->conf, attr, &action_flags, + &actions_n, error); + if (ret < 0) + return ret; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + decap = actions->conf; + while ((++actions)->type == RTE_FLOW_ACTION_TYPE_VOID) + ; + if (actions->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + encap = NULL; + actions--; + } else { + encap = actions->conf; + } + ret = flow_dv_validate_action_raw_encap_decap + (dev, + decap ? 
decap : &empty_decap, encap, + attr, &action_flags, &actions_n, + error); + if (ret < 0) + return ret; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + ret = flow_dv_validate_action_modify_mac(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? + MLX5_FLOW_ACTION_SET_MAC_SRC : + MLX5_FLOW_ACTION_SET_MAC_DST; + /* + * Even if the source and destination MAC addresses have + * overlap in the header with 4B alignment, the convert + * function will handle them separately and 4 SW actions + * will be created. And 2 actions will be added each + * time no matter how many bytes of address will be set. + */ + rw_act_num += MLX5_ACT_NUM_MDF_MAC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + ret = flow_dv_validate_action_modify_ipv4(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? + MLX5_FLOW_ACTION_SET_IPV4_SRC : + MLX5_FLOW_ACTION_SET_IPV4_DST; + rw_act_num += MLX5_ACT_NUM_MDF_IPV4; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + ret = flow_dv_validate_action_modify_ipv6(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + if (item_ipv6_proto == IPPROTO_ICMPV6) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "Can't change header " + "with ICMPv6 proto"); + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? + MLX5_FLOW_ACTION_SET_IPV6_SRC : + MLX5_FLOW_ACTION_SET_IPV6_DST; + rw_act_num += MLX5_ACT_NUM_MDF_IPV6; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + ret = flow_dv_validate_action_modify_tp(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_TP_SRC ? + MLX5_FLOW_ACTION_SET_TP_SRC : + MLX5_FLOW_ACTION_SET_TP_DST; + rw_act_num += MLX5_ACT_NUM_MDF_PORT; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + case RTE_FLOW_ACTION_TYPE_SET_TTL: + ret = flow_dv_validate_action_modify_ttl(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_TTL ? 
+ MLX5_FLOW_ACTION_SET_TTL : + MLX5_FLOW_ACTION_DEC_TTL; + rw_act_num += MLX5_ACT_NUM_MDF_TTL; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + ret = flow_dv_validate_action_jump(actions, + action_flags, + attr, external, + error); + if (ret) + return ret; + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_JUMP; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: + ret = flow_dv_validate_action_modify_tcp_seq + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ? + MLX5_FLOW_ACTION_INC_TCP_SEQ : + MLX5_FLOW_ACTION_DEC_TCP_SEQ; + rw_act_num += MLX5_ACT_NUM_MDF_TCPSEQ; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK: + ret = flow_dv_validate_action_modify_tcp_ack + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ? + MLX5_FLOW_ACTION_INC_TCP_ACK : + MLX5_FLOW_ACTION_DEC_TCP_ACK; + rw_act_num += MLX5_ACT_NUM_MDF_TCPACK; + break; + case MLX5_RTE_FLOW_ACTION_TYPE_MARK: + break; + case MLX5_RTE_FLOW_ACTION_TYPE_TAG: + case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG: + rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_METER: + ret = mlx5_flow_validate_action_meter(dev, + action_flags, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_METER; + ++actions_n; + /* Meter action will add one more TAG action. */ + rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_AGE: + ret = flow_dv_validate_action_age(action_flags, + actions, dev, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_AGE; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP: + ret = flow_dv_validate_action_modify_ipv4_dscp + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_IPV4_DSCP; + rw_act_num += MLX5_ACT_NUM_SET_DSCP; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: + ret = flow_dv_validate_action_modify_ipv6_dscp + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP; + rw_act_num += MLX5_ACT_NUM_SET_DSCP; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + /* + * Validate the drop action mutual exclusion with other actions. + * Drop action is mutually-exclusive with any other action, except for + * Count action. 
+ */ + if ((action_flags & MLX5_FLOW_ACTION_DROP) && + (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Drop action is mutually-exclusive " + "with any other action, except for " + "Count action"); + /* Eswitch has few restrictions on using items and actions */ + if (attr->transfer) { + if (!mlx5_flow_ext_mreg_supported(dev) && + action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action FLAG"); + if (!mlx5_flow_ext_mreg_supported(dev) && + action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action MARK"); + if (action_flags & MLX5_FLOW_ACTION_QUEUE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action QUEUE"); + if (action_flags & MLX5_FLOW_ACTION_RSS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action RSS"); + if (!(action_flags & MLX5_FLOW_FATE_ESWITCH_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no fate action is found"); + } else { + if (!(action_flags & MLX5_FLOW_FATE_ACTIONS) && attr->ingress) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no fate action is found"); + } + /* Continue validation for Xcap actions.*/ + if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) && (queue_index == 0xFFFF || + mlx5_rxq_get_type(dev, queue_index) != MLX5_RXQ_TYPE_HAIRPIN)) { + if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) == + MLX5_FLOW_XCAP_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "encap and decap " + "combination aren't supported"); + if (!attr->transfer && attr->ingress && (action_flags & + MLX5_FLOW_ACTION_ENCAP)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "encap is not supported" + " for ingress traffic"); + } + /* Hairpin flow will add one more TAG action. */ + if (hairpin > 0) + rw_act_num += MLX5_ACT_NUM_SET_TAG; + /* extra metadata enabled: one more TAG action will be add. */ + if (dev_conf->dv_flow_en && + dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(dev)) + rw_act_num += MLX5_ACT_NUM_SET_TAG; + if ((uint32_t)rw_act_num > + flow_dv_modify_hdr_action_max(dev, is_root)) { + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "too many header modify" + " actions to support"); + } + return 0; +} + +/** + * Internal preparation function. Allocates the DV flow size, + * this size is constant. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to mlx5_flow object on success, + * otherwise NULL and rte_errno is set. 
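/*
 * Editor's note (illustrative sketch, not driver code): flow_dv_validate()
 * above folds every parsed action into an action_flags bit mask and then
 * checks combinations on the accumulated mask, e.g. DROP may only coexist
 * with COUNT and at least one fate action must be present. The toy
 * validator below shows the same bit-mask style of mutual-exclusion
 * checking with invented ACT_* flag names.
 */
#include <stdint.h>

#define ACT_DROP  (1u << 0)
#define ACT_QUEUE (1u << 1)
#define ACT_COUNT (1u << 2)
#define ACT_MARK  (1u << 3)

static int
check_action_flags(uint32_t action_flags)
{
	/* DROP is mutually exclusive with everything except COUNT. */
	if ((action_flags & ACT_DROP) &&
	    (action_flags & ~(ACT_DROP | ACT_COUNT)))
		return -1;
	/* At least one "fate" action (drop or queue) must be present. */
	if (!(action_flags & (ACT_DROP | ACT_QUEUE)))
		return -1;
	return 0;
}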
+ */ +static struct mlx5_flow * +flow_dv_prepare(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error) +{ + uint32_t handle_idx = 0; + struct mlx5_flow *dev_flow; + struct mlx5_flow_handle *dev_handle; + struct mlx5_priv *priv = dev->data->dev_private; + + /* In case of corrupting the memory. */ + if (priv->flow_idx >= MLX5_NUM_MAX_DEV_FLOWS) { + rte_flow_error_set(error, ENOSPC, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not free temporary device flow"); + return NULL; + } + dev_handle = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + &handle_idx); + if (!dev_handle) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not enough memory to create flow handle"); + return NULL; + } + /* No multi-thread supporting. */ + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[priv->flow_idx++]; + dev_flow->handle = dev_handle; + dev_flow->handle_idx = handle_idx; + dev_flow->dv.value.size = MLX5_ST_SZ_BYTES(fte_match_param); + /* + * The matching value needs to be cleared to 0 before using. In the + * past, it will be automatically cleared when using rte_*alloc + * API. The time consumption will be almost the same as before. + */ + memset(dev_flow->dv.value.buf, 0, MLX5_ST_SZ_BYTES(fte_match_param)); + dev_flow->ingress = attr->ingress; + dev_flow->dv.transfer = attr->transfer; + return dev_flow; +} + +#ifdef RTE_LIBRTE_MLX5_DEBUG +/** + * Sanity check for match mask and value. Similar to check_valid_spec() in + * kernel driver. If unmasked bit is present in value, it returns failure. + * + * @param match_mask + * pointer to match mask buffer. + * @param match_value + * pointer to match value buffer. + * + * @return + * 0 if valid, -EINVAL otherwise. + */ +static int +flow_dv_check_valid_spec(void *match_mask, void *match_value) +{ + uint8_t *m = match_mask; + uint8_t *v = match_value; + unsigned int i; + + for (i = 0; i < MLX5_ST_SZ_BYTES(fte_match_param); ++i) { + if (v[i] & ~m[i]) { + DRV_LOG(ERR, + "match_value differs from match_criteria" + " %p[%u] != %p[%u]", + match_value, i, match_mask, i); + return -EINVAL; + } + } + return 0; +} +#endif + +/** + * Add match of ip_version. + * + * @param[in] group + * Flow group. + * @param[in] headers_v + * Values header pointer. + * @param[in] headers_m + * Masks header pointer. + * @param[in] ip_version + * The IP version to set. + */ +static inline void +flow_dv_set_match_ip_version(uint32_t group, + void *headers_v, + void *headers_m, + uint8_t ip_version) +{ + if (group == 0) + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 0xf); + else + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, + ip_version); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, ip_version); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, 0); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype, 0); +} + +/** + * Add Ethernet item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
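+ * @param[in] group
+ * The group to insert the rule.
+ *
+ * A minimal call sketch (illustrative only, the buffer names are
+ * hypothetical; the actual caller passes the matcher mask buffer and the
+ * flow value buffer):
+ *
+ *   flow_dv_translate_item_eth(match_mask_buf, match_value_buf,
+ *                              item, tunnel, attr->group);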
+ */
+static void
+flow_dv_translate_item_eth(void *matcher, void *key,
+ const struct rte_flow_item *item, int inner,
+ uint32_t group)
+{
+ const struct rte_flow_item_eth *eth_m = item->mask;
+ const struct rte_flow_item_eth *eth_v = item->spec;
+ const struct rte_flow_item_eth nic_mask = {
+ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .type = RTE_BE16(0xffff),
+ };
+ void *headers_m;
+ void *headers_v;
+ char *l24_v;
+ unsigned int i;
+
+ if (!eth_v)
+ return;
+ if (!eth_m)
+ eth_m = &nic_mask;
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, dmac_47_16),
+ &eth_m->dst, sizeof(eth_m->dst));
+ /* The value must be in the range of the mask. */
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, dmac_47_16);
+ for (i = 0; i < sizeof(eth_m->dst); ++i)
+ l24_v[i] = eth_m->dst.addr_bytes[i] & eth_v->dst.addr_bytes[i];
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, smac_47_16),
+ &eth_m->src, sizeof(eth_m->src));
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, smac_47_16);
+ /* The value must be in the range of the mask. */
+ for (i = 0; i < sizeof(eth_m->dst); ++i)
+ l24_v[i] = eth_m->src.addr_bytes[i] & eth_v->src.addr_bytes[i];
+ if (eth_v->type) {
+ /* When ethertype is present set mask for tagged VLAN. */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1);
+ /* Set value for tagged VLAN if ethertype is 802.1Q. */
+ if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
+ eth_v->type == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag,
+ 1);
+ /* Return here to avoid setting match on ethertype. */
+ return;
+ }
+ }
+ /*
+ * HW supports match on one Ethertype, the Ethertype following the last
+ * VLAN tag of the packet (see PRM).
+ * Set match on ethertype only if ETH header is not followed by VLAN.
+ * HW is optimized for IPv4/IPv6. In such cases, avoid setting
+ * ethertype, and use ip_version field instead.
+ */
+ if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_IPV4) &&
+ eth_m->type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 4);
+ } else if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_IPV6) &&
+ eth_m->type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 6);
+ } else {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype,
+ rte_be_to_cpu_16(eth_m->type));
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+ ethertype);
+ *(uint16_t *)(l24_v) = eth_m->type & eth_v->type;
+ }
+}
+
+/**
+ * Add VLAN item to matcher and to the value.
+ *
+ * @param[in, out] dev_flow
+ * Flow descriptor.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
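+ * @param[in] group
+ * The group to insert the rule.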
+ */ +static void +flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow, + void *matcher, void *key, + const struct rte_flow_item *item, + int inner, uint32_t group) +{ + const struct rte_flow_item_vlan *vlan_m = item->mask; + const struct rte_flow_item_vlan *vlan_v = item->spec; + void *headers_m; + void *headers_v; + uint16_t tci_m; + uint16_t tci_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + /* + * This is workaround, masks are not supported, + * and pre-validated. + */ + if (vlan_v) + dev_flow->handle->vf_vlan.tag = + rte_be_to_cpu_16(vlan_v->tci) & 0x0fff; + } + /* + * When VLAN item exists in flow, mark packet as tagged, + * even if TCI is not specified. + */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); + if (!vlan_v) + return; + if (!vlan_m) + vlan_m = &rte_flow_item_vlan_mask; + tci_m = rte_be_to_cpu_16(vlan_m->tci); + tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_vid, tci_m); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, tci_v); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_cfi, tci_m >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_cfi, tci_v >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_prio, tci_m >> 13); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, tci_v >> 13); + /* + * HW is optimized for IPv4/IPv6. In such cases, avoid setting + * ethertype, and use ip_version field instead. + */ + if (vlan_v->inner_type == RTE_BE16(RTE_ETHER_TYPE_IPV4) && + vlan_m->inner_type == 0xFFFF) { + flow_dv_set_match_ip_version(group, headers_v, headers_m, 4); + } else if (vlan_v->inner_type == RTE_BE16(RTE_ETHER_TYPE_IPV6) && + vlan_m->inner_type == 0xFFFF) { + flow_dv_set_match_ip_version(group, headers_v, headers_m, 6); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype, + rte_be_to_cpu_16(vlan_m->inner_type)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + rte_be_to_cpu_16(vlan_m->inner_type & + vlan_v->inner_type)); + } +} + +/** + * Add IPV4 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] inner + * Item is inner pattern. + * @param[in] group + * The group to insert the rule. 
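+ *
+ * Note that the value written to the key is always spec AND mask, so e.g. a
+ * spec dst_addr of 192.168.0.9 with mask 255.255.255.0 is programmed as
+ * 192.168.0.0, which keeps the value within the range of the mask.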
+ */ +static void +flow_dv_translate_item_ipv4(void *matcher, void *key, + const struct rte_flow_item *item, + const uint64_t item_flags, + int inner, uint32_t group) +{ + const struct rte_flow_item_ipv4 *ipv4_m = item->mask; + const struct rte_flow_item_ipv4 *ipv4_v = item->spec; + const struct rte_flow_item_ipv4 nic_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + .time_to_live = 0xff, + }, + }; + void *headers_m; + void *headers_v; + char *l24_m; + char *l24_v; + uint8_t tos; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + flow_dv_set_match_ip_version(group, headers_v, headers_m, 4); + /* + * On outer header (which must contains L2), or inner header with L2, + * set cvlan_tag mask bit to mark this packet as untagged. + * This should be done even if item->spec is empty. + */ + if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2) + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + if (!ipv4_v) + return; + if (!ipv4_m) + ipv4_m = &nic_mask; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + *(uint32_t *)l24_m = ipv4_m->hdr.dst_addr; + *(uint32_t *)l24_v = ipv4_m->hdr.dst_addr & ipv4_v->hdr.dst_addr; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + *(uint32_t *)l24_m = ipv4_m->hdr.src_addr; + *(uint32_t *)l24_v = ipv4_m->hdr.src_addr & ipv4_v->hdr.src_addr; + tos = ipv4_m->hdr.type_of_service & ipv4_v->hdr.type_of_service; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn, + ipv4_m->hdr.type_of_service); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, tos); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp, + ipv4_m->hdr.type_of_service >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, tos >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, + ipv4_m->hdr.next_proto_id); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + ipv4_v->hdr.next_proto_id & ipv4_m->hdr.next_proto_id); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ttl_hoplimit, + ipv4_m->hdr.time_to_live); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ttl_hoplimit, + ipv4_v->hdr.time_to_live & ipv4_m->hdr.time_to_live); +} + +/** + * Add IPV6 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] inner + * Item is inner pattern. + * @param[in] group + * The group to insert the rule. 
+ */ +static void +flow_dv_translate_item_ipv6(void *matcher, void *key, + const struct rte_flow_item *item, + const uint64_t item_flags, + int inner, uint32_t group) +{ + const struct rte_flow_item_ipv6 *ipv6_m = item->mask; + const struct rte_flow_item_ipv6 *ipv6_v = item->spec; + const struct rte_flow_item_ipv6 nic_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + .hop_limits = 0xff, + }, + }; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + char *l24_m; + char *l24_v; + uint32_t vtc_m; + uint32_t vtc_v; + int i; + int size; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + flow_dv_set_match_ip_version(group, headers_v, headers_m, 6); + /* + * On outer header (which must contains L2), or inner header with L2, + * set cvlan_tag mask bit to mark this packet as untagged. + * This should be done even if item->spec is empty. + */ + if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2) + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + if (!ipv6_v) + return; + if (!ipv6_m) + ipv6_m = &nic_mask; + size = sizeof(ipv6_m->hdr.dst_addr); + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + memcpy(l24_m, ipv6_m->hdr.dst_addr, size); + for (i = 0; i < size; ++i) + l24_v[i] = l24_m[i] & ipv6_v->hdr.dst_addr[i]; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + memcpy(l24_m, ipv6_m->hdr.src_addr, size); + for (i = 0; i < size; ++i) + l24_v[i] = l24_m[i] & ipv6_v->hdr.src_addr[i]; + /* TOS. */ + vtc_m = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow); + vtc_v = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow & ipv6_v->hdr.vtc_flow); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn, vtc_m >> 20); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, vtc_v >> 20); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp, vtc_m >> 22); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, vtc_v >> 22); + /* Label. */ + if (inner) { + MLX5_SET(fte_match_set_misc, misc_m, inner_ipv6_flow_label, + vtc_m); + MLX5_SET(fte_match_set_misc, misc_v, inner_ipv6_flow_label, + vtc_v); + } else { + MLX5_SET(fte_match_set_misc, misc_m, outer_ipv6_flow_label, + vtc_m); + MLX5_SET(fte_match_set_misc, misc_v, outer_ipv6_flow_label, + vtc_v); + } + /* Protocol. */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, + ipv6_m->hdr.proto); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + ipv6_v->hdr.proto & ipv6_m->hdr.proto); + /* Hop limit. */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ttl_hoplimit, + ipv6_m->hdr.hop_limits); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ttl_hoplimit, + ipv6_v->hdr.hop_limits & ipv6_m->hdr.hop_limits); +} + +/** + * Add TCP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. 
+ * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_tcp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_tcp *tcp_m = item->mask; + const struct rte_flow_item_tcp *tcp_v = item->spec; + void *headers_m; + void *headers_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_TCP); + if (!tcp_v) + return; + if (!tcp_m) + tcp_m = &rte_flow_item_tcp_mask; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_sport, + rte_be_to_cpu_16(tcp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport, + rte_be_to_cpu_16(tcp_v->hdr.src_port & tcp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_dport, + rte_be_to_cpu_16(tcp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport, + rte_be_to_cpu_16(tcp_v->hdr.dst_port & tcp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_flags, + tcp_m->hdr.tcp_flags); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_flags, + (tcp_v->hdr.tcp_flags & tcp_m->hdr.tcp_flags)); +} + +/** + * Add UDP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_udp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_udp *udp_m = item->mask; + const struct rte_flow_item_udp *udp_v = item->spec; + void *headers_m; + void *headers_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP); + if (!udp_v) + return; + if (!udp_m) + udp_m = &rte_flow_item_udp_mask; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_sport, + rte_be_to_cpu_16(udp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, + rte_be_to_cpu_16(udp_v->hdr.src_port & udp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, + rte_be_to_cpu_16(udp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + rte_be_to_cpu_16(udp_v->hdr.dst_port & udp_m->hdr.dst_port)); +} + +/** + * Add GRE optional Key item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
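+ *
+ * The 32-bit GRE key is split according to the device layout: e.g. a fully
+ * masked key of 0x12345678 is programmed as gre_key_h = 0x123456 and
+ * gre_key_l = 0x78 (value and mask are combined with a bitwise AND first).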
+ */ +static void +flow_dv_translate_item_gre_key(void *matcher, void *key, + const struct rte_flow_item *item) +{ + const rte_be32_t *key_m = item->mask; + const rte_be32_t *key_v = item->spec; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + rte_be32_t gre_key_default_mask = RTE_BE32(UINT32_MAX); + + /* GRE K bit must be on and should already be validated */ + MLX5_SET(fte_match_set_misc, misc_m, gre_k_present, 1); + MLX5_SET(fte_match_set_misc, misc_v, gre_k_present, 1); + if (!key_v) + return; + if (!key_m) + key_m = &gre_key_default_mask; + MLX5_SET(fte_match_set_misc, misc_m, gre_key_h, + rte_be_to_cpu_32(*key_m) >> 8); + MLX5_SET(fte_match_set_misc, misc_v, gre_key_h, + rte_be_to_cpu_32((*key_v) & (*key_m)) >> 8); + MLX5_SET(fte_match_set_misc, misc_m, gre_key_l, + rte_be_to_cpu_32(*key_m) & 0xFF); + MLX5_SET(fte_match_set_misc, misc_v, gre_key_l, + rte_be_to_cpu_32((*key_v) & (*key_m)) & 0xFF); +} + +/** + * Add GRE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_gre(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_gre *gre_m = item->mask; + const struct rte_flow_item_gre *gre_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + struct { + union { + __extension__ + struct { + uint16_t version:3; + uint16_t rsvd0:9; + uint16_t s_present:1; + uint16_t k_present:1; + uint16_t rsvd_bit1:1; + uint16_t c_present:1; + }; + uint16_t value; + }; + } gre_crks_rsvd0_ver_m, gre_crks_rsvd0_ver_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_GRE); + if (!gre_v) + return; + if (!gre_m) + gre_m = &rte_flow_item_gre_mask; + MLX5_SET(fte_match_set_misc, misc_m, gre_protocol, + rte_be_to_cpu_16(gre_m->protocol)); + MLX5_SET(fte_match_set_misc, misc_v, gre_protocol, + rte_be_to_cpu_16(gre_v->protocol & gre_m->protocol)); + gre_crks_rsvd0_ver_m.value = rte_be_to_cpu_16(gre_m->c_rsvd0_ver); + gre_crks_rsvd0_ver_v.value = rte_be_to_cpu_16(gre_v->c_rsvd0_ver); + MLX5_SET(fte_match_set_misc, misc_m, gre_c_present, + gre_crks_rsvd0_ver_m.c_present); + MLX5_SET(fte_match_set_misc, misc_v, gre_c_present, + gre_crks_rsvd0_ver_v.c_present & + gre_crks_rsvd0_ver_m.c_present); + MLX5_SET(fte_match_set_misc, misc_m, gre_k_present, + gre_crks_rsvd0_ver_m.k_present); + MLX5_SET(fte_match_set_misc, misc_v, gre_k_present, + gre_crks_rsvd0_ver_v.k_present & + gre_crks_rsvd0_ver_m.k_present); + MLX5_SET(fte_match_set_misc, misc_m, gre_s_present, + gre_crks_rsvd0_ver_m.s_present); + MLX5_SET(fte_match_set_misc, misc_v, gre_s_present, + gre_crks_rsvd0_ver_v.s_present & + gre_crks_rsvd0_ver_m.s_present); +} + +/** + * Add NVGRE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. 
+ * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_nvgre(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_nvgre *nvgre_m = item->mask; + const struct rte_flow_item_nvgre *nvgre_v = item->spec; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + const char *tni_flow_id_m = (const char *)nvgre_m->tni; + const char *tni_flow_id_v = (const char *)nvgre_v->tni; + char *gre_key_m; + char *gre_key_v; + int size; + int i; + + /* For NVGRE, GRE header fields must be set with defined values. */ + const struct rte_flow_item_gre gre_spec = { + .c_rsvd0_ver = RTE_BE16(0x2000), + .protocol = RTE_BE16(RTE_ETHER_TYPE_TEB) + }; + const struct rte_flow_item_gre gre_mask = { + .c_rsvd0_ver = RTE_BE16(0xB000), + .protocol = RTE_BE16(UINT16_MAX), + }; + const struct rte_flow_item gre_item = { + .spec = &gre_spec, + .mask = &gre_mask, + .last = NULL, + }; + flow_dv_translate_item_gre(matcher, key, &gre_item, inner); + if (!nvgre_v) + return; + if (!nvgre_m) + nvgre_m = &rte_flow_item_nvgre_mask; + size = sizeof(nvgre_m->tni) + sizeof(nvgre_m->flow_id); + gre_key_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, gre_key_h); + gre_key_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, gre_key_h); + memcpy(gre_key_m, tni_flow_id_m, size); + for (i = 0; i < size; ++i) + gre_key_v[i] = gre_key_m[i] & tni_flow_id_v[i]; +} + +/** + * Add VXLAN item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_vxlan(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_vxlan *vxlan_m = item->mask; + const struct rte_flow_item_vxlan *vxlan_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + char *vni_m; + char *vni_v; + uint16_t dport; + int size; + int i; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ? + MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE; + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!vxlan_v) + return; + if (!vxlan_m) + vxlan_m = &rte_flow_item_vxlan_mask; + size = sizeof(vxlan_m->vni); + vni_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, vxlan_vni); + vni_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, vxlan_vni); + memcpy(vni_m, vxlan_m->vni, size); + for (i = 0; i < size; ++i) + vni_v[i] = vni_m[i] & vxlan_v->vni[i]; +} + +/** + * Add VXLAN-GPE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
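+ *
+ * If the preceding UDP item left the destination port unspecified, the
+ * translation below adds a full match on the default VXLAN-GPE UDP port so
+ * the tunnel header is only interpreted for that port.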
+ */ + +static void +flow_dv_translate_item_vxlan_gpe(void *matcher, void *key, + const struct rte_flow_item *item, int inner) +{ + const struct rte_flow_item_vxlan_gpe *vxlan_m = item->mask; + const struct rte_flow_item_vxlan_gpe *vxlan_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_3); + void *misc_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + char *vni_m; + char *vni_v; + uint16_t dport; + int size; + int i; + uint8_t flags_m = 0xff; + uint8_t flags_v = 0xc; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ? + MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE; + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!vxlan_v) + return; + if (!vxlan_m) + vxlan_m = &rte_flow_item_vxlan_gpe_mask; + size = sizeof(vxlan_m->vni); + vni_m = MLX5_ADDR_OF(fte_match_set_misc3, misc_m, outer_vxlan_gpe_vni); + vni_v = MLX5_ADDR_OF(fte_match_set_misc3, misc_v, outer_vxlan_gpe_vni); + memcpy(vni_m, vxlan_m->vni, size); + for (i = 0; i < size; ++i) + vni_v[i] = vni_m[i] & vxlan_v->vni[i]; + if (vxlan_m->flags) { + flags_m = vxlan_m->flags; + flags_v = vxlan_v->flags; + } + MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_flags, flags_m); + MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_flags, flags_v); + MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_next_protocol, + vxlan_m->protocol); + MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_next_protocol, + vxlan_v->protocol); +} + +/** + * Add Geneve item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
+ */ + +static void +flow_dv_translate_item_geneve(void *matcher, void *key, + const struct rte_flow_item *item, int inner) +{ + const struct rte_flow_item_geneve *geneve_m = item->mask; + const struct rte_flow_item_geneve *geneve_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + uint16_t dport; + uint16_t gbhdr_m; + uint16_t gbhdr_v; + char *vni_m; + char *vni_v; + size_t size, i; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + dport = MLX5_UDP_PORT_GENEVE; + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!geneve_v) + return; + if (!geneve_m) + geneve_m = &rte_flow_item_geneve_mask; + size = sizeof(geneve_m->vni); + vni_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, geneve_vni); + vni_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, geneve_vni); + memcpy(vni_m, geneve_m->vni, size); + for (i = 0; i < size; ++i) + vni_v[i] = vni_m[i] & geneve_v->vni[i]; + MLX5_SET(fte_match_set_misc, misc_m, geneve_protocol_type, + rte_be_to_cpu_16(geneve_m->protocol)); + MLX5_SET(fte_match_set_misc, misc_v, geneve_protocol_type, + rte_be_to_cpu_16(geneve_v->protocol & geneve_m->protocol)); + gbhdr_m = rte_be_to_cpu_16(geneve_m->ver_opt_len_o_c_rsvd0); + gbhdr_v = rte_be_to_cpu_16(geneve_v->ver_opt_len_o_c_rsvd0); + MLX5_SET(fte_match_set_misc, misc_m, geneve_oam, + MLX5_GENEVE_OAMF_VAL(gbhdr_m)); + MLX5_SET(fte_match_set_misc, misc_v, geneve_oam, + MLX5_GENEVE_OAMF_VAL(gbhdr_v) & MLX5_GENEVE_OAMF_VAL(gbhdr_m)); + MLX5_SET(fte_match_set_misc, misc_m, geneve_opt_len, + MLX5_GENEVE_OPTLEN_VAL(gbhdr_m)); + MLX5_SET(fte_match_set_misc, misc_v, geneve_opt_len, + MLX5_GENEVE_OPTLEN_VAL(gbhdr_v) & + MLX5_GENEVE_OPTLEN_VAL(gbhdr_m)); +} + +/** + * Add MPLS item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] prev_layer + * The protocol layer indicated in previous item. + * @param[in] inner + * Item is inner pattern. 
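+ *
+ * Depending on @p prev_layer the MPLS label is matched over UDP (fixed
+ * MPLS-over-UDP destination port), over GRE (MPLS ethertype in the GRE
+ * protocol field) or directly over IP (MPLS IP protocol number).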
+ */ +static void +flow_dv_translate_item_mpls(void *matcher, void *key, + const struct rte_flow_item *item, + uint64_t prev_layer, + int inner) +{ + const uint32_t *in_mpls_m = item->mask; + const uint32_t *in_mpls_v = item->spec; + uint32_t *out_mpls_m = 0; + uint32_t *out_mpls_v = 0; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + void *misc2_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_2); + void *misc2_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2); + void *headers_m = MLX5_ADDR_OF(fte_match_param, matcher, outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + + switch (prev_layer) { + case MLX5_FLOW_LAYER_OUTER_L4_UDP: + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + MLX5_UDP_PORT_MPLS); + break; + case MLX5_FLOW_LAYER_GRE: + MLX5_SET(fte_match_set_misc, misc_m, gre_protocol, 0xffff); + MLX5_SET(fte_match_set_misc, misc_v, gre_protocol, + RTE_ETHER_TYPE_MPLS); + break; + default: + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_MPLS); + break; + } + if (!in_mpls_v) + return; + if (!in_mpls_m) + in_mpls_m = (const uint32_t *)&rte_flow_item_mpls_mask; + switch (prev_layer) { + case MLX5_FLOW_LAYER_OUTER_L4_UDP: + out_mpls_m = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_m, + outer_first_mpls_over_udp); + out_mpls_v = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp); + break; + case MLX5_FLOW_LAYER_GRE: + out_mpls_m = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_m, + outer_first_mpls_over_gre); + out_mpls_v = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_gre); + break; + default: + /* Inner MPLS not over GRE is not supported. */ + if (!inner) { + out_mpls_m = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, + misc2_m, + outer_first_mpls); + out_mpls_v = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, + misc2_v, + outer_first_mpls); + } + break; + } + if (out_mpls_m && out_mpls_v) { + *out_mpls_m = *in_mpls_m; + *out_mpls_v = *in_mpls_v & *in_mpls_m; + } +} + +/** + * Add metadata register item to matcher + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] reg_type + * Type of device metadata register + * @param[in] value + * Register value + * @param[in] mask + * Register mask + */ +static void +flow_dv_match_meta_reg(void *matcher, void *key, + enum modify_reg reg_type, + uint32_t data, uint32_t mask) +{ + void *misc2_m = + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2); + void *misc2_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2); + uint32_t temp; + + data &= mask; + switch (reg_type) { + case REG_A: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a, data); + break; + case REG_B: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_b, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_b, data); + break; + case REG_C_0: + /* + * The metadata register C0 field might be divided into + * source vport index and META item value, we should set + * this field according to specified mask, not as whole one. 
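+ * For example, if the item mask covers only 0x0000ff00 of the register,
+ * just those bits are merged into the stored mask and value words by the
+ * read-modify-write below, and the bits holding the source vport index are
+ * preserved.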
+ */ + temp = MLX5_GET(fte_match_set_misc2, misc2_m, metadata_reg_c_0); + temp |= mask; + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_0, temp); + temp = MLX5_GET(fte_match_set_misc2, misc2_v, metadata_reg_c_0); + temp &= ~mask; + temp |= data; + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_0, temp); + break; + case REG_C_1: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_1, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_1, data); + break; + case REG_C_2: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_2, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_2, data); + break; + case REG_C_3: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_3, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_3, data); + break; + case REG_C_4: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_4, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_4, data); + break; + case REG_C_5: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_5, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_5, data); + break; + case REG_C_6: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_6, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_6, data); + break; + case REG_C_7: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_7, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_7, data); + break; + default: + MLX5_ASSERT(false); + break; + } +} + +/** + * Add MARK item to matcher + * + * @param[in] dev + * The device to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + */ +static void +flow_dv_translate_item_mark(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_mark *mark; + uint32_t value; + uint32_t mask; + + mark = item->mask ? (const void *)item->mask : + &rte_flow_item_mark_mask; + mask = mark->id & priv->sh->dv_mark_mask; + mark = (const void *)item->spec; + MLX5_ASSERT(mark); + value = mark->id & priv->sh->dv_mark_mask & mask; + if (mask) { + enum modify_reg reg; + + /* Get the metadata register index for the mark. */ + reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, NULL); + MLX5_ASSERT(reg > 0); + if (reg == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); + + mask &= msk_c0; + mask <<= shl_c0; + value <<= shl_c0; + } + flow_dv_match_meta_reg(matcher, key, reg, value, mask); + } +} + +/** + * Add META item to matcher + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[in] item + * Flow pattern to translate. 
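+ *
+ * The register actually matched on is resolved at translation time via
+ * flow_dv_get_metadata_reg(); when it resolves to REG_C_0, the value and
+ * mask are additionally shifted into the bits reserved for META data.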
+ */ +static void +flow_dv_translate_item_meta(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_attr *attr, + const struct rte_flow_item *item) +{ + const struct rte_flow_item_meta *meta_m; + const struct rte_flow_item_meta *meta_v; + + meta_m = (const void *)item->mask; + if (!meta_m) + meta_m = &rte_flow_item_meta_mask; + meta_v = (const void *)item->spec; + if (meta_v) { + int reg; + uint32_t value = meta_v->data; + uint32_t mask = meta_m->data; + + reg = flow_dv_get_metadata_reg(dev, attr, NULL); + if (reg < 0) + return; + /* + * In datapath code there is no endianness + * coversions for perfromance reasons, all + * pattern conversions are done in rte_flow. + */ + value = rte_cpu_to_be_32(value); + mask = rte_cpu_to_be_32(mask); + if (reg == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + uint32_t shr_c0 = __builtin_clz(priv->sh->dv_meta_mask); + + value >>= shr_c0; + mask >>= shr_c0; +#endif + value <<= shl_c0; + mask <<= shl_c0; + MLX5_ASSERT(msk_c0); + MLX5_ASSERT(!(~msk_c0 & mask)); + } + flow_dv_match_meta_reg(matcher, key, reg, value, mask); + } +} + +/** + * Add vport metadata Reg C0 item to matcher + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] reg + * Flow pattern to translate. + */ +static void +flow_dv_translate_item_meta_vport(void *matcher, void *key, + uint32_t value, uint32_t mask) +{ + flow_dv_match_meta_reg(matcher, key, REG_C_0, value, mask); +} + +/** + * Add tag item to matcher + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + */ +static void +flow_dv_translate_mlx5_item_tag(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + const struct mlx5_rte_flow_item_tag *tag_v = item->spec; + const struct mlx5_rte_flow_item_tag *tag_m = item->mask; + uint32_t mask, value; + + MLX5_ASSERT(tag_v); + value = tag_v->data; + mask = tag_m ? tag_m->data : UINT32_MAX; + if (tag_v->id == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); + + mask &= msk_c0; + mask <<= shl_c0; + value <<= shl_c0; + } + flow_dv_match_meta_reg(matcher, key, tag_v->id, value, mask); +} + +/** + * Add TAG item to matcher + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + */ +static void +flow_dv_translate_item_tag(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + const struct rte_flow_item_tag *tag_v = item->spec; + const struct rte_flow_item_tag *tag_m = item->mask; + enum modify_reg reg; + + MLX5_ASSERT(tag_v); + tag_m = tag_m ? tag_m : &rte_flow_item_tag_mask; + /* Get the metadata register index for the tag. */ + reg = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, tag_v->index, NULL); + MLX5_ASSERT(reg > 0); + flow_dv_match_meta_reg(matcher, key, reg, tag_v->data, tag_m->data); +} + +/** + * Add source vport match to the specified matcher. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. 
+ * @param[in] port + * Source vport value to match + * @param[in] mask + * Mask + */ +static void +flow_dv_translate_item_source_vport(void *matcher, void *key, + int16_t port, uint16_t mask) +{ + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + + MLX5_SET(fte_match_set_misc, misc_m, source_port, mask); + MLX5_SET(fte_match_set_misc, misc_v, source_port, port); +} + +/** + * Translate port-id item to eswitch match on port-id. + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * + * @return + * 0 on success, a negative errno value otherwise. + */ +static int +flow_dv_translate_item_port_id(struct rte_eth_dev *dev, void *matcher, + void *key, const struct rte_flow_item *item) +{ + const struct rte_flow_item_port_id *pid_m = item ? item->mask : NULL; + const struct rte_flow_item_port_id *pid_v = item ? item->spec : NULL; + struct mlx5_priv *priv; + uint16_t mask, id; + + mask = pid_m ? pid_m->id : 0xffff; + id = pid_v ? pid_v->id : dev->data->port_id; + priv = mlx5_port_to_eswitch_info(id, item == NULL); + if (!priv) + return -rte_errno; + /* Translate to vport field or to metadata, depending on mode. */ + if (priv->vport_meta_mask) + flow_dv_translate_item_meta_vport(matcher, key, + priv->vport_meta_tag, + priv->vport_meta_mask); + else + flow_dv_translate_item_source_vport(matcher, key, + priv->vport_id, mask); + return 0; +} + +/** + * Add ICMP6 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_icmp6(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_icmp6 *icmp6_m = item->mask; + const struct rte_flow_item_icmp6 *icmp6_v = item->spec; + void *headers_m; + void *headers_v; + void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_3); + void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_ICMPV6); + if (!icmp6_v) + return; + if (!icmp6_m) + icmp6_m = &rte_flow_item_icmp6_mask; + /* + * Force flow only to match the non-fragmented IPv6 ICMPv6 packets. + * If only the protocol is specified, no need to match the frag. + */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + MLX5_SET(fte_match_set_misc3, misc3_m, icmpv6_type, icmp6_m->type); + MLX5_SET(fte_match_set_misc3, misc3_v, icmpv6_type, + icmp6_v->type & icmp6_m->type); + MLX5_SET(fte_match_set_misc3, misc3_m, icmpv6_code, icmp6_m->code); + MLX5_SET(fte_match_set_misc3, misc3_v, icmpv6_code, + icmp6_v->code & icmp6_m->code); +} + +/** + * Add ICMP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. 
+ * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_icmp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_icmp *icmp_m = item->mask; + const struct rte_flow_item_icmp *icmp_v = item->spec; + void *headers_m; + void *headers_v; + void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_3); + void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_ICMP); + if (!icmp_v) + return; + if (!icmp_m) + icmp_m = &rte_flow_item_icmp_mask; + /* + * Force flow only to match the non-fragmented IPv4 ICMP packets. + * If only the protocol is specified, no need to match the frag. + */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + MLX5_SET(fte_match_set_misc3, misc3_m, icmp_type, + icmp_m->hdr.icmp_type); + MLX5_SET(fte_match_set_misc3, misc3_v, icmp_type, + icmp_v->hdr.icmp_type & icmp_m->hdr.icmp_type); + MLX5_SET(fte_match_set_misc3, misc3_m, icmp_code, + icmp_m->hdr.icmp_code); + MLX5_SET(fte_match_set_misc3, misc3_v, icmp_code, + icmp_v->hdr.icmp_code & icmp_m->hdr.icmp_code); +} + +/** + * Add GTP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
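+ *
+ * As with the other UDP based tunnels, if the preceding UDP item did not
+ * specify a destination port, a match on the default GTP-U UDP port is
+ * added.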
+ */ +static void +flow_dv_translate_item_gtp(void *matcher, void *key, + const struct rte_flow_item *item, int inner) +{ + const struct rte_flow_item_gtp *gtp_m = item->mask; + const struct rte_flow_item_gtp *gtp_v = item->spec; + void *headers_m; + void *headers_v; + void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_3); + void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + uint16_t dport = RTE_GTPU_UDP_PORT; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!gtp_v) + return; + if (!gtp_m) + gtp_m = &rte_flow_item_gtp_mask; + MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_msg_flags, + gtp_m->v_pt_rsv_flags); + MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_msg_flags, + gtp_v->v_pt_rsv_flags & gtp_m->v_pt_rsv_flags); + MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_msg_type, gtp_m->msg_type); + MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_msg_type, + gtp_v->msg_type & gtp_m->msg_type); + MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_teid, + rte_be_to_cpu_32(gtp_m->teid)); + MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_teid, + rte_be_to_cpu_32(gtp_v->teid & gtp_m->teid)); +} + +static uint32_t matcher_zero[MLX5_ST_SZ_DW(fte_match_param)] = { 0 }; + +#define HEADER_IS_ZERO(match_criteria, headers) \ + !(memcmp(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ + matcher_zero, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ + +/** + * Calculate flow matcher enable bitmap. + * + * @param match_criteria + * Pointer to flow matcher criteria. + * + * @return + * Bitmap of enabled fields. + */ +static uint8_t +flow_dv_matcher_enable(uint32_t *match_criteria) +{ + uint8_t match_criteria_enable; + + match_criteria_enable = + (!HEADER_IS_ZERO(match_criteria, outer_headers)) << + MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, inner_headers)) << + MLX5_MATCH_CRITERIA_ENABLE_INNER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_3)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC3_BIT; + return match_criteria_enable; +} + + +/** + * Get a flow table. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in] table_id + * Table id to use. + * @param[in] egress + * Direction of the table. + * @param[in] transfer + * E-Switch or NIC flow. + * @param[out] error + * pointer to error structure. + * + * @return + * Returns tables resource based on the index, NULL in case of failed. 
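+ *
+ * The returned table is reference counted; a sketch of the expected pairing
+ * (variable names are illustrative):
+ *
+ *   tbl = flow_dv_tbl_resource_get(dev, table_id, egress, transfer, error);
+ *   if (!tbl)
+ *       return -rte_errno;
+ *   ... use tbl->obj ...
+ *   flow_dv_tbl_resource_release(dev, tbl);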
+ */ +static struct mlx5_flow_tbl_resource * +flow_dv_tbl_resource_get(struct rte_eth_dev *dev, + uint32_t table_id, uint8_t egress, + uint8_t transfer, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_tbl_resource *tbl; + union mlx5_flow_tbl_key table_key = { + { + .table_id = table_id, + .reserved = 0, + .domain = !!transfer, + .direction = !!egress, + } + }; + struct mlx5_hlist_entry *pos = mlx5_hlist_lookup(sh->flow_tbls, + table_key.v64); + struct mlx5_flow_tbl_data_entry *tbl_data; + uint32_t idx = 0; + int ret; + void *domain; + + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + tbl = &tbl_data->tbl; + rte_atomic32_inc(&tbl->refcnt); + return tbl; + } + tbl_data = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_JUMP], &idx); + if (!tbl_data) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot allocate flow table data entry"); + return NULL; + } + tbl_data->idx = idx; + tbl = &tbl_data->tbl; + pos = &tbl_data->entry; + if (transfer) + domain = sh->fdb_domain; + else if (egress) + domain = sh->tx_domain; + else + domain = sh->rx_domain; + tbl->obj = mlx5_glue->dr_create_flow_tbl(domain, table_id); + if (!tbl->obj) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create flow table object"); + mlx5_ipool_free(sh->ipool[MLX5_IPOOL_JUMP], idx); + return NULL; + } + /* + * No multi-threads now, but still better to initialize the reference + * count before insert it into the hash list. + */ + rte_atomic32_init(&tbl->refcnt); + /* Jump action reference count is initialized here. */ + rte_atomic32_init(&tbl_data->jump.refcnt); + pos->key = table_key.v64; + ret = mlx5_hlist_insert(sh->flow_tbls, pos); + if (ret < 0) { + rte_flow_error_set(error, -ret, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot insert flow table data entry"); + mlx5_glue->dr_destroy_flow_tbl(tbl->obj); + mlx5_ipool_free(sh->ipool[MLX5_IPOOL_JUMP], idx); + } + rte_atomic32_inc(&tbl->refcnt); + return tbl; +} + +/** + * Release a flow table. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] tbl + * Table resource to be released. + * + * @return + * Returns 0 if table was released, else return 1; + */ +static int +flow_dv_tbl_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_tbl_resource *tbl) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_tbl_data_entry *tbl_data = + container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl); + + if (!tbl) + return 0; + if (rte_atomic32_dec_and_test(&tbl->refcnt)) { + struct mlx5_hlist_entry *pos = &tbl_data->entry; + + mlx5_glue->dr_destroy_flow_tbl(tbl->obj); + tbl->obj = NULL; + /* remove the entry from the hash list and free memory. */ + mlx5_hlist_remove(sh->flow_tbls, pos); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_JUMP], + tbl_data->idx); + return 0; + } + return 1; +} + +/** + * Register the flow matcher. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] matcher + * Pointer to flow matcher. + * @param[in, out] key + * Pointer to flow table key. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. 
+ */ +static int +flow_dv_matcher_register(struct rte_eth_dev *dev, + struct mlx5_flow_dv_matcher *matcher, + union mlx5_flow_tbl_key *key, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_matcher *cache_matcher; + struct mlx5dv_flow_matcher_attr dv_attr = { + .type = IBV_FLOW_ATTR_NORMAL, + .match_mask = (void *)&matcher->mask, + }; + struct mlx5_flow_tbl_resource *tbl; + struct mlx5_flow_tbl_data_entry *tbl_data; + + tbl = flow_dv_tbl_resource_get(dev, key->table_id, key->direction, + key->domain, error); + if (!tbl) + return -rte_errno; /* No need to refill the error info */ + tbl_data = container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl); + /* Lookup from cache. */ + LIST_FOREACH(cache_matcher, &tbl_data->matchers, next) { + if (matcher->crc == cache_matcher->crc && + matcher->priority == cache_matcher->priority && + !memcmp((const void *)matcher->mask.buf, + (const void *)cache_matcher->mask.buf, + cache_matcher->mask.size)) { + DRV_LOG(DEBUG, + "%s group %u priority %hd use %s " + "matcher %p: refcnt %d++", + key->domain ? "FDB" : "NIC", key->table_id, + cache_matcher->priority, + key->direction ? "tx" : "rx", + (void *)cache_matcher, + rte_atomic32_read(&cache_matcher->refcnt)); + rte_atomic32_inc(&cache_matcher->refcnt); + dev_flow->handle->dvh.matcher = cache_matcher; + /* old matcher should not make the table ref++. */ + flow_dv_tbl_resource_release(dev, tbl); + return 0; + } + } + /* Register new matcher. */ + cache_matcher = rte_calloc(__func__, 1, sizeof(*cache_matcher), 0); + if (!cache_matcher) { + flow_dv_tbl_resource_release(dev, tbl); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate matcher memory"); + } + *cache_matcher = *matcher; + dv_attr.match_criteria_enable = + flow_dv_matcher_enable(cache_matcher->mask.buf); + dv_attr.priority = matcher->priority; + if (key->direction) + dv_attr.flags |= IBV_FLOW_ATTR_FLAGS_EGRESS; + cache_matcher->matcher_object = + mlx5_glue->dv_create_flow_matcher(sh->ctx, &dv_attr, tbl->obj); + if (!cache_matcher->matcher_object) { + rte_free(cache_matcher); +#ifdef HAVE_MLX5DV_DR + flow_dv_tbl_resource_release(dev, tbl); +#endif + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create matcher"); + } + /* Save the table information */ + cache_matcher->tbl = tbl; + rte_atomic32_init(&cache_matcher->refcnt); + /* only matcher ref++, table ref++ already done above in get API. */ + rte_atomic32_inc(&cache_matcher->refcnt); + LIST_INSERT_HEAD(&tbl_data->matchers, cache_matcher, next); + dev_flow->handle->dvh.matcher = cache_matcher; + DRV_LOG(DEBUG, "%s group %u priority %hd new %s matcher %p: refcnt %d", + key->domain ? "FDB" : "NIC", key->table_id, + cache_matcher->priority, + key->direction ? "tx" : "rx", (void *)cache_matcher, + rte_atomic32_read(&cache_matcher->refcnt)); + return 0; +} + +/** + * Find existing tag resource or create and register a new one. + * + * @param dev[in, out] + * Pointer to rte_eth_dev structure. + * @param[in, out] tag_be24 + * Tag value in big endian then R-shift 8. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. 
+ */ +static int +flow_dv_tag_resource_register + (struct rte_eth_dev *dev, + uint32_t tag_be24, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_tag_resource *cache_resource; + struct mlx5_hlist_entry *entry; + + /* Lookup a matching resource from cache. */ + entry = mlx5_hlist_lookup(sh->tag_table, (uint64_t)tag_be24); + if (entry) { + cache_resource = container_of + (entry, struct mlx5_flow_dv_tag_resource, entry); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.rix_tag = cache_resource->idx; + dev_flow->dv.tag_resource = cache_resource; + DRV_LOG(DEBUG, "cached tag resource %p: refcnt now %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; + } + /* Register new resource. */ + cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_TAG], + &dev_flow->handle->dvh.rix_tag); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + cache_resource->entry.key = (uint64_t)tag_be24; + cache_resource->action = mlx5_glue->dv_create_flow_action_tag(tag_be24); + if (!cache_resource->action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + if (mlx5_hlist_insert(sh->tag_table, &cache_resource->entry)) { + mlx5_glue->destroy_flow_action(cache_resource->action); + rte_free(cache_resource); + return rte_flow_error_set(error, EEXIST, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot insert tag"); + } + dev_flow->dv.tag_resource = cache_resource; + DRV_LOG(DEBUG, "new tag resource %p: refcnt now %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} + +/** + * Release the tag. + * + * @param dev + * Pointer to Ethernet device. + * @param tag_idx + * Tag index. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_tag_release(struct rte_eth_dev *dev, + uint32_t tag_idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_tag_resource *tag; + + tag = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_TAG], tag_idx); + if (!tag) + return 0; + DRV_LOG(DEBUG, "port %u tag %p: refcnt %d--", + dev->data->port_id, (void *)tag, + rte_atomic32_read(&tag->refcnt)); + if (rte_atomic32_dec_and_test(&tag->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action(tag->action)); + mlx5_hlist_remove(sh->tag_table, &tag->entry); + DRV_LOG(DEBUG, "port %u tag %p: removed", + dev->data->port_id, (void *)tag); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_TAG], tag_idx); + return 0; + } + return 1; +} + +/** + * Translate port ID action to vport. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to the port ID action. + * @param[out] dst_port_id + * The target port ID. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */
+static int
+flow_dv_translate_action_port_id(struct rte_eth_dev *dev,
+				 const struct rte_flow_action *action,
+				 uint32_t *dst_port_id,
+				 struct rte_flow_error *error)
+{
+	uint32_t port;
+	struct mlx5_priv *priv;
+	const struct rte_flow_action_port_id *conf =
+			(const struct rte_flow_action_port_id *)action->conf;
+
+	port = conf->original ? dev->data->port_id : conf->id;
+	priv = mlx5_port_to_eswitch_info(port, false);
+	if (!priv)
+		return rte_flow_error_set(error, -rte_errno,
+					  RTE_FLOW_ERROR_TYPE_ACTION,
+					  NULL,
+					  "No eswitch info was found for port");
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+	/*
+	 * This parameter is transferred to
+	 * mlx5dv_dr_action_create_dest_ib_port().
+	 */
+	*dst_port_id = priv->ibv_port;
+#else
+	/*
+	 * Legacy mode, no LAG configuration is supported.
+	 * This parameter is transferred to
+	 * mlx5dv_dr_action_create_dest_vport().
+	 */
+	*dst_port_id = priv->vport_id;
+#endif
+	return 0;
+}
+
+/**
+ * Create a counter with aging configuration.
+ *
+ * @param[in] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[in] dev_flow
+ *   Pointer to the mlx5_flow.
+ * @param[in] count
+ *   Pointer to the counter action configuration.
+ * @param[in] age
+ *   Pointer to the aging action configuration.
+ *
+ * @return
+ *   Index to flow counter on success, 0 otherwise.
+ */
+static uint32_t
+flow_dv_translate_create_counter(struct rte_eth_dev *dev,
+				struct mlx5_flow *dev_flow,
+				const struct rte_flow_action_count *count,
+				const struct rte_flow_action_age *age)
+{
+	uint32_t counter;
+	struct mlx5_age_param *age_param;
+
+	counter = flow_dv_counter_alloc(dev,
+				count ? count->shared : 0,
+				count ? count->id : 0,
+				dev_flow->dv.group, !!age);
+	if (!counter || age == NULL)
+		return counter;
+	age_param = flow_dv_counter_idx_get_age(dev, counter);
+	age_param->context = age->context ? age->context :
+		(void *)(uintptr_t)(dev_flow->flow_idx);
+	/*
+	 * The counter age accuracy may have a bit delay. Have 3/4
+	 * second bias on the timeout in order to let it age in time.
+	 */
+	age_param->timeout = age->timeout * 10 - MLX5_AGING_TIME_DELAY;
+	age_param->port_id = dev->data->port_id;
+	/* Set expire time in unit of 0.1 sec. */
+	age_param->expire = age_param->timeout +
+			rte_rdtsc() / (rte_get_tsc_hz() / 10);
+	rte_atomic16_set(&age_param->state, AGE_CANDIDATE);
+	return counter;
+}
+
+/**
+ * Add Tx queue matcher.
+ *
+ * @param[in] dev
+ *   Pointer to the dev struct.
+ * @param[in, out] matcher
+ *   Flow matcher.
+ * @param[in, out] key
+ *   Flow matcher value.
+ * @param[in] item
+ *   Flow pattern to translate.
+ */ +static void +flow_dv_translate_item_tx_queue(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + const struct mlx5_rte_flow_item_tx_queue *queue_m; + const struct mlx5_rte_flow_item_tx_queue *queue_v; + void *misc_m = + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + struct mlx5_txq_ctrl *txq; + uint32_t queue; + + + queue_m = (const void *)item->mask; + if (!queue_m) + return; + queue_v = (const void *)item->spec; + if (!queue_v) + return; + txq = mlx5_txq_get(dev, queue_v->queue); + if (!txq) + return; + queue = txq->obj->sq->id; + MLX5_SET(fte_match_set_misc, misc_m, source_sqn, queue_m->queue); + MLX5_SET(fte_match_set_misc, misc_v, source_sqn, + queue & queue_m->queue); + mlx5_txq_release(dev, queue_v->queue); +} + +/** + * Set the hash fields according to the @p flow information. + * + * @param[in] dev_flow + * Pointer to the mlx5_flow. + * @param[in] rss_desc + * Pointer to the mlx5_flow_rss_desc. + */ +static void +flow_dv_hashfields_set(struct mlx5_flow *dev_flow, + struct mlx5_flow_rss_desc *rss_desc) +{ + uint64_t items = dev_flow->handle->layers; + int rss_inner = 0; + uint64_t rss_types = rte_eth_rss_hf_refine(rss_desc->types); + + dev_flow->hash_fields = 0; +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (rss_desc->level >= 2) { + dev_flow->hash_fields |= IBV_RX_HASH_INNER; + rss_inner = 1; + } +#endif + if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L3_IPV4)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L3_IPV4))) { + if (rss_types & MLX5_IPV4_LAYER_TYPES) { + if (rss_types & ETH_RSS_L3_SRC_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_SRC_IPV4; + else if (rss_types & ETH_RSS_L3_DST_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_DST_IPV4; + else + dev_flow->hash_fields |= MLX5_IPV4_IBV_RX_HASH; + } + } else if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L3_IPV6)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L3_IPV6))) { + if (rss_types & MLX5_IPV6_LAYER_TYPES) { + if (rss_types & ETH_RSS_L3_SRC_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_SRC_IPV6; + else if (rss_types & ETH_RSS_L3_DST_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_DST_IPV6; + else + dev_flow->hash_fields |= MLX5_IPV6_IBV_RX_HASH; + } + } + if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L4_UDP)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L4_UDP))) { + if (rss_types & ETH_RSS_UDP) { + if (rss_types & ETH_RSS_L4_SRC_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_SRC_PORT_UDP; + else if (rss_types & ETH_RSS_L4_DST_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_DST_PORT_UDP; + else + dev_flow->hash_fields |= MLX5_UDP_IBV_RX_HASH; + } + } else if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L4_TCP)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L4_TCP))) { + if (rss_types & ETH_RSS_TCP) { + if (rss_types & ETH_RSS_L4_SRC_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_SRC_PORT_TCP; + else if (rss_types & ETH_RSS_L4_DST_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_DST_PORT_TCP; + else + dev_flow->hash_fields |= MLX5_TCP_IBV_RX_HASH; + } + } +} + +/** + * Fill the flow with DV spec, lock free + * (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] dev_flow + * Pointer to the sub flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. 
+ * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +__flow_dv_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *dev_conf = &priv->config; + struct rte_flow *flow = dev_flow->flow; + struct mlx5_flow_handle *handle = dev_flow->handle; + struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *) + priv->rss_desc) + [!!priv->flow_nested_idx]; + uint64_t item_flags = 0; + uint64_t last_item = 0; + uint64_t action_flags = 0; + uint64_t priority = attr->priority; + struct mlx5_flow_dv_matcher matcher = { + .mask = { + .size = sizeof(matcher.mask.buf), + }, + }; + int actions_n = 0; + bool actions_end = false; + union { + struct mlx5_flow_dv_modify_hdr_resource res; + uint8_t len[sizeof(struct mlx5_flow_dv_modify_hdr_resource) + + sizeof(struct mlx5_modification_cmd) * + (MLX5_MAX_MODIFY_NUM + 1)]; + } mhdr_dummy; + struct mlx5_flow_dv_modify_hdr_resource *mhdr_res = &mhdr_dummy.res; + const struct rte_flow_action_count *count = NULL; + const struct rte_flow_action_age *age = NULL; + union flow_dv_attr flow_attr = { .attr = 0 }; + uint32_t tag_be; + union mlx5_flow_tbl_key tbl_key; + uint32_t modify_action_position = UINT32_MAX; + void *match_mask = matcher.mask.buf; + void *match_value = dev_flow->dv.value.buf; + uint8_t next_protocol = 0xff; + struct rte_vlan_hdr vlan = { 0 }; + uint32_t table; + int ret = 0; + + mhdr_res->ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + ret = mlx5_flow_group_to_table(attr, dev_flow->external, attr->group, + !!priv->fdb_def_rule, &table, error); + if (ret) + return ret; + dev_flow->dv.group = table; + if (attr->transfer) + mhdr_res->ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = dev_conf->flow_prio - 1; + /* number of actions must be set to 0 in case of dirty stack. 
*/ + mhdr_res->actions_num = 0; + for (; !actions_end ; actions++) { + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + const struct rte_flow_action *action = actions; + const uint8_t *rss_key; + const struct rte_flow_action_jump *jump_data; + const struct rte_flow_action_meter *mtr; + struct mlx5_flow_tbl_resource *tbl; + uint32_t port_id = 0; + struct mlx5_flow_dv_port_id_action_resource port_id_resource; + int action_type = actions->type; + const struct rte_flow_action *found_action = NULL; + struct mlx5_flow_meter *fm = NULL; + + switch (action_type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + if (flow_dv_translate_action_port_id(dev, action, + &port_id, error)) + return -rte_errno; + port_id_resource.port_id = port_id; + MLX5_ASSERT(!handle->rix_port_id_action); + if (flow_dv_port_id_action_resource_register + (dev, &port_id_resource, dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.port_id_action->action; + action_flags |= MLX5_FLOW_ACTION_PORT_ID; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_PORT_ID; + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + action_flags |= MLX5_FLOW_ACTION_FLAG; + dev_flow->handle->mark = 1; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + struct rte_flow_action_mark mark = { + .id = MLX5_FLOW_MARK_DEFAULT, + }; + + if (flow_dv_convert_action_mark(dev, &mark, + mhdr_res, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_MARK_EXT; + break; + } + tag_be = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT); + /* + * Only one FLAG or MARK is supported per device flow + * right now. So the pointer to the tag resource must be + * zero before the register process. + */ + MLX5_ASSERT(!handle->dvh.rix_tag); + if (flow_dv_tag_resource_register(dev, tag_be, + dev_flow, error)) + return -rte_errno; + MLX5_ASSERT(dev_flow->dv.tag_resource); + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.tag_resource->action; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + action_flags |= MLX5_FLOW_ACTION_MARK; + dev_flow->handle->mark = 1; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + const struct rte_flow_action_mark *mark = + (const struct rte_flow_action_mark *) + actions->conf; + + if (flow_dv_convert_action_mark(dev, mark, + mhdr_res, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_MARK_EXT; + break; + } + /* Fall-through */ + case MLX5_RTE_FLOW_ACTION_TYPE_MARK: + /* Legacy (non-extensive) MARK action. 
*/ + tag_be = mlx5_flow_mark_set + (((const struct rte_flow_action_mark *) + (actions->conf))->id); + MLX5_ASSERT(!handle->dvh.rix_tag); + if (flow_dv_tag_resource_register(dev, tag_be, + dev_flow, error)) + return -rte_errno; + MLX5_ASSERT(dev_flow->dv.tag_resource); + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.tag_resource->action; + break; + case RTE_FLOW_ACTION_TYPE_SET_META: + if (flow_dv_convert_action_set_meta + (dev, mhdr_res, attr, + (const struct rte_flow_action_set_meta *) + actions->conf, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_META; + break; + case RTE_FLOW_ACTION_TYPE_SET_TAG: + if (flow_dv_convert_action_set_tag + (dev, mhdr_res, + (const struct rte_flow_action_set_tag *) + actions->conf, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + action_flags |= MLX5_FLOW_ACTION_DROP; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + queue = actions->conf; + rss_desc->queue_num = 1; + rss_desc->queue[0] = queue->index; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = actions->conf; + memcpy(rss_desc->queue, rss->queue, + rss->queue_num * sizeof(uint16_t)); + rss_desc->queue_num = rss->queue_num; + /* NULL RSS key indicates default RSS key. */ + rss_key = !rss->key ? rss_hash_default_key : rss->key; + memcpy(rss_desc->key, rss_key, MLX5_RSS_HASH_KEY_LEN); + /* + * rss->level and rss.types should be set in advance + * when expanding items for RSS. + */ + action_flags |= MLX5_FLOW_ACTION_RSS; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_AGE: + case RTE_FLOW_ACTION_TYPE_COUNT: + if (!dev_conf->devx) { + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "count action not supported"); + } + /* Save information first, will apply later. 
*/ + if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT) + count = action->conf; + else + age = action->conf; + action_flags |= MLX5_FLOW_ACTION_COUNT; + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + dev_flow->dv.actions[actions_n++] = + priv->sh->pop_vlan_action; + action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + if (!(action_flags & + MLX5_FLOW_ACTION_OF_SET_VLAN_VID)) + flow_dev_get_vlan_info_from_items(items, &vlan); + vlan.eth_proto = rte_be_to_cpu_16 + ((((const struct rte_flow_action_of_push_vlan *) + actions->conf)->ethertype)); + found_action = mlx5_flow_find_action + (actions + 1, + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID); + if (found_action) + mlx5_update_vlan_vid_pcp(found_action, &vlan); + found_action = mlx5_flow_find_action + (actions + 1, + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP); + if (found_action) + mlx5_update_vlan_vid_pcp(found_action, &vlan); + if (flow_dv_create_action_push_vlan + (dev, attr, &vlan, dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.push_vlan_res->action; + action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + /* of_vlan_push action handled this action */ + MLX5_ASSERT(action_flags & + MLX5_FLOW_ACTION_OF_PUSH_VLAN); + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + if (action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) + break; + flow_dev_get_vlan_info_from_items(items, &vlan); + mlx5_update_vlan_vid_pcp(actions, &vlan); + /* If no VLAN push - this is a modify header action */ + if (flow_dv_convert_action_modify_vlan_vid + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + if (flow_dv_create_action_l2_encap(dev, actions, + dev_flow, + attr->transfer, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + action_flags |= MLX5_FLOW_ACTION_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + if (flow_dv_create_action_l2_decap(dev, dev_flow, + attr->transfer, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + action_flags |= MLX5_FLOW_ACTION_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + /* Handle encap with preceding decap. */ + if (action_flags & MLX5_FLOW_ACTION_DECAP) { + if (flow_dv_create_action_raw_encap + (dev, actions, dev_flow, attr, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + } else { + /* Handle encap without preceding decap. */ + if (flow_dv_create_action_l2_encap + (dev, actions, dev_flow, attr->transfer, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + } + action_flags |= MLX5_FLOW_ACTION_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + while ((++action)->type == RTE_FLOW_ACTION_TYPE_VOID) + ; + if (action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + if (flow_dv_create_action_l2_decap + (dev, dev_flow, attr->transfer, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + } + /* If decap is followed by encap, handle it at encap. 
*/ + action_flags |= MLX5_FLOW_ACTION_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + jump_data = action->conf; + ret = mlx5_flow_group_to_table(attr, dev_flow->external, + jump_data->group, + !!priv->fdb_def_rule, + &table, error); + if (ret) + return ret; + tbl = flow_dv_tbl_resource_get(dev, table, + attr->egress, + attr->transfer, error); + if (!tbl) + return rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot create jump action."); + if (flow_dv_jump_tbl_resource_register + (dev, tbl, dev_flow, error)) { + flow_dv_tbl_resource_release(dev, tbl); + return rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot create jump action."); + } + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.jump->action; + action_flags |= MLX5_FLOW_ACTION_JUMP; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_JUMP; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + if (flow_dv_convert_action_modify_mac + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? + MLX5_FLOW_ACTION_SET_MAC_SRC : + MLX5_FLOW_ACTION_SET_MAC_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + if (flow_dv_convert_action_modify_ipv4 + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? + MLX5_FLOW_ACTION_SET_IPV4_SRC : + MLX5_FLOW_ACTION_SET_IPV4_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + if (flow_dv_convert_action_modify_ipv6 + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? + MLX5_FLOW_ACTION_SET_IPV6_SRC : + MLX5_FLOW_ACTION_SET_IPV6_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + if (flow_dv_convert_action_modify_tp + (mhdr_res, actions, items, + &flow_attr, dev_flow, !!(action_flags & + MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_TP_SRC ? + MLX5_FLOW_ACTION_SET_TP_SRC : + MLX5_FLOW_ACTION_SET_TP_DST; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + if (flow_dv_convert_action_modify_dec_ttl + (mhdr_res, items, &flow_attr, dev_flow, + !!(action_flags & + MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_DEC_TTL; + break; + case RTE_FLOW_ACTION_TYPE_SET_TTL: + if (flow_dv_convert_action_modify_ttl + (mhdr_res, actions, items, &flow_attr, + dev_flow, !!(action_flags & + MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TTL; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: + if (flow_dv_convert_action_modify_tcp_seq + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ? + MLX5_FLOW_ACTION_INC_TCP_SEQ : + MLX5_FLOW_ACTION_DEC_TCP_SEQ; + break; + + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK: + if (flow_dv_convert_action_modify_tcp_ack + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ? 
+ MLX5_FLOW_ACTION_INC_TCP_ACK : + MLX5_FLOW_ACTION_DEC_TCP_ACK; + break; + case MLX5_RTE_FLOW_ACTION_TYPE_TAG: + if (flow_dv_convert_action_set_reg + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; + case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG: + if (flow_dv_convert_action_copy_mreg + (dev, mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_METER: + mtr = actions->conf; + if (!flow->meter) { + fm = mlx5_flow_meter_attach(priv, mtr->mtr_id, + attr, error); + if (!fm) + return rte_flow_error_set(error, + rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "meter not found " + "or invalid parameters"); + flow->meter = fm->idx; + } + /* Set the meter action. */ + if (!fm) { + fm = mlx5_ipool_get(priv->sh->ipool + [MLX5_IPOOL_MTR], flow->meter); + if (!fm) + return rte_flow_error_set(error, + rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "meter not found " + "or invalid parameters"); + } + dev_flow->dv.actions[actions_n++] = + fm->mfts->meter_action; + action_flags |= MLX5_FLOW_ACTION_METER; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP: + if (flow_dv_convert_action_modify_ipv4_dscp(mhdr_res, + actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_IPV4_DSCP; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: + if (flow_dv_convert_action_modify_ipv6_dscp(mhdr_res, + actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP; + break; + case RTE_FLOW_ACTION_TYPE_END: + actions_end = true; + if (mhdr_res->actions_num) { + /* create modify action if needed. */ + if (flow_dv_modify_hdr_resource_register + (dev, mhdr_res, dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[modify_action_position] = + handle->dvh.modify_hdr->verbs_action; + } + if (action_flags & MLX5_FLOW_ACTION_COUNT) { + flow->counter = + flow_dv_translate_create_counter(dev, + dev_flow, count, age); + + if (!flow->counter) + return rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot create counter" + " object."); + dev_flow->dv.actions[actions_n++] = + (flow_dv_counter_get_by_idx(dev, + flow->counter, NULL))->action; + } + break; + default: + break; + } + if (mhdr_res->actions_num && + modify_action_position == UINT32_MAX) + modify_action_position = actions_n++; + } + dev_flow->dv.actions_n = actions_n; + dev_flow->act_flags = action_flags; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int item_type = items->type; + + switch (item_type) { + case RTE_FLOW_ITEM_TYPE_PORT_ID: + flow_dv_translate_item_port_id(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_PORT_ID; + break; + case RTE_FLOW_ITEM_TYPE_ETH: + flow_dv_translate_item_eth(match_mask, match_value, + items, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L2; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + flow_dv_translate_item_vlan(dev_flow, + match_mask, match_value, + items, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L2; + last_item = tunnel ? 
(MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + flow_dv_translate_item_ipv4(match_mask, match_value, + items, item_flags, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L3; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) { + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + next_protocol &= + ((const struct rte_flow_item_ipv4 *) + (items->mask))->hdr.next_proto_id; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + flow_dv_translate_item_ipv6(match_mask, match_value, + items, item_flags, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L3; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) { + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol &= + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_TCP: + flow_dv_translate_item_tcp(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L4; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_dv_translate_item_udp(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L4; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + flow_dv_translate_item_gre(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_GRE_KEY: + flow_dv_translate_item_gre_key(match_mask, + match_value, items); + last_item = MLX5_FLOW_LAYER_GRE_KEY; + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + flow_dv_translate_item_nvgre(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + flow_dv_translate_item_vxlan(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + flow_dv_translate_item_vxlan_gpe(match_mask, + match_value, items, + tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GENEVE: + flow_dv_translate_item_geneve(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? 
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GENEVE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + flow_dv_translate_item_mpls(match_mask, match_value, + items, last_item, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_MPLS; + break; + case RTE_FLOW_ITEM_TYPE_MARK: + flow_dv_translate_item_mark(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_MARK; + break; + case RTE_FLOW_ITEM_TYPE_META: + flow_dv_translate_item_meta(dev, match_mask, + match_value, attr, items); + last_item = MLX5_FLOW_ITEM_METADATA; + break; + case RTE_FLOW_ITEM_TYPE_ICMP: + flow_dv_translate_item_icmp(match_mask, match_value, + items, tunnel); + last_item = MLX5_FLOW_LAYER_ICMP; + break; + case RTE_FLOW_ITEM_TYPE_ICMP6: + flow_dv_translate_item_icmp6(match_mask, match_value, + items, tunnel); + last_item = MLX5_FLOW_LAYER_ICMP6; + break; + case RTE_FLOW_ITEM_TYPE_TAG: + flow_dv_translate_item_tag(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_TAG; + break; + case MLX5_RTE_FLOW_ITEM_TYPE_TAG: + flow_dv_translate_mlx5_item_tag(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_TAG; + break; + case MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE: + flow_dv_translate_item_tx_queue(dev, match_mask, + match_value, + items); + last_item = MLX5_FLOW_ITEM_TX_QUEUE; + break; + case RTE_FLOW_ITEM_TYPE_GTP: + flow_dv_translate_item_gtp(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GTP; + break; + default: + break; + } + item_flags |= last_item; + } + /* + * When E-Switch mode is enabled, we have two cases where we need to + * set the source port manually. + * The first one, is in case of Nic steering rule, and the second is + * E-Switch rule where no port_id item was found. In both cases + * the source port is set according the current port in use. + */ + if (!(item_flags & MLX5_FLOW_ITEM_PORT_ID) && + (priv->representor || priv->master)) { + if (flow_dv_translate_item_port_id(dev, match_mask, + match_value, NULL)) + return -rte_errno; + } +#ifdef RTE_LIBRTE_MLX5_DEBUG + MLX5_ASSERT(!flow_dv_check_valid_spec(matcher.mask.buf, + dev_flow->dv.value.buf)); +#endif + /* + * Layers may be already initialized from prefix flow if this dev_flow + * is the suffix flow. + */ + handle->layers |= item_flags; + if (action_flags & MLX5_FLOW_ACTION_RSS) + flow_dv_hashfields_set(dev_flow, rss_desc); + /* Register matcher. */ + matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf, + matcher.mask.size); + matcher.priority = mlx5_flow_adjust_priority(dev, priority, + matcher.priority); + /* reserved field no needs to be set to 0 here. */ + tbl_key.domain = attr->transfer; + tbl_key.direction = attr->egress; + tbl_key.table_id = dev_flow->dv.group; + if (flow_dv_matcher_register(dev, &matcher, &tbl_key, dev_flow, error)) + return -rte_errno; + return 0; +} + +/** + * Apply the flow to the NIC, lock free, + * (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +__flow_dv_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv_workspace *dv; + struct mlx5_flow_handle *dh; + struct mlx5_flow_handle_dv *dv_h; + struct mlx5_flow *dev_flow; + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + int n; + int err; + int idx; + + for (idx = priv->flow_idx - 1; idx >= priv->flow_nested_idx; idx--) { + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[idx]; + dv = &dev_flow->dv; + dh = dev_flow->handle; + dv_h = &dh->dvh; + n = dv->actions_n; + if (dh->fate_action == MLX5_FLOW_FATE_DROP) { + if (dv->transfer) { + dv->actions[n++] = priv->sh->esw_drop_action; + } else { + struct mlx5_hrxq *drop_hrxq; + drop_hrxq = mlx5_hrxq_drop_new(dev); + if (!drop_hrxq) { + rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot get drop hash queue"); + goto error; + } + /* + * Drop queues will be released by the specify + * mlx5_hrxq_drop_release() function. Assign + * the special index to hrxq to mark the queue + * has been allocated. + */ + dh->rix_hrxq = UINT32_MAX; + dv->actions[n++] = drop_hrxq->action; + } + } else if (dh->fate_action == MLX5_FLOW_FATE_QUEUE) { + struct mlx5_hrxq *hrxq; + uint32_t hrxq_idx; + struct mlx5_flow_rss_desc *rss_desc = + &((struct mlx5_flow_rss_desc *)priv->rss_desc) + [!!priv->flow_nested_idx]; + + MLX5_ASSERT(rss_desc->queue_num); + hrxq_idx = mlx5_hrxq_get(dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num); + if (!hrxq_idx) { + hrxq_idx = mlx5_hrxq_new + (dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num, + !!(dh->layers & + MLX5_FLOW_LAYER_TUNNEL)); + } + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + hrxq_idx); + if (!hrxq) { + rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get hash queue"); + goto error; + } + dh->rix_hrxq = hrxq_idx; + dv->actions[n++] = hrxq->action; + } + dh->ib_flow = + mlx5_glue->dv_create_flow(dv_h->matcher->matcher_object, + (void *)&dv->value, n, + dv->actions); + if (!dh->ib_flow) { + rte_flow_error_set(error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "hardware refuses to create flow"); + goto error; + } + if (priv->vmwa_context && + dh->vf_vlan.tag && !dh->vf_vlan.created) { + /* + * The rule contains the VLAN pattern. + * For VF we are going to create VLAN + * interface to make hypervisor set correct + * e-Switch vport context. + */ + mlx5_vlan_vmwa_acquire(dev, &dh->vf_vlan); + } + } + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dh, next) { + /* hrxq is union, don't clear it if the flag is not set. */ + if (dh->rix_hrxq) { + if (dh->fate_action == MLX5_FLOW_FATE_DROP) { + mlx5_hrxq_drop_release(dev); + dh->rix_hrxq = 0; + } else if (dh->fate_action == MLX5_FLOW_FATE_QUEUE) { + mlx5_hrxq_release(dev, dh->rix_hrxq); + dh->rix_hrxq = 0; + } + } + if (dh->vf_vlan.tag && dh->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &dh->vf_vlan); + } + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Release the flow matcher. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
+ */ +static int +flow_dv_matcher_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_flow_dv_matcher *matcher = handle->dvh.matcher; + + MLX5_ASSERT(matcher->matcher_object); + DRV_LOG(DEBUG, "port %u matcher %p: refcnt %d--", + dev->data->port_id, (void *)matcher, + rte_atomic32_read(&matcher->refcnt)); + if (rte_atomic32_dec_and_test(&matcher->refcnt)) { + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (matcher->matcher_object)); + LIST_REMOVE(matcher, next); + /* table ref-- in release interface. */ + flow_dv_tbl_resource_release(dev, matcher->tbl); + rte_free(matcher); + DRV_LOG(DEBUG, "port %u matcher %p: removed", + dev->data->port_id, (void *)matcher); + return 0; + } + return 1; +} + +/** + * Release an encap/decap resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_encap_decap_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t idx = handle->dvh.rix_encap_decap; + struct mlx5_flow_dv_encap_decap_resource *cache_resource; + + cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP], + idx); + if (!cache_resource) + return 0; + MLX5_ASSERT(cache_resource->verbs_action); + DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->verbs_action)); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP], + &priv->sh->encaps_decaps, idx, + cache_resource, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP], idx); + DRV_LOG(DEBUG, "encap/decap resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release an jump to table action resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_jump_tbl_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_dv_jump_tbl_resource *cache_resource; + struct mlx5_flow_tbl_data_entry *tbl_data; + + tbl_data = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_JUMP], + handle->rix_jump); + if (!tbl_data) + return 0; + cache_resource = &tbl_data->jump; + MLX5_ASSERT(cache_resource->action); + DRV_LOG(DEBUG, "jump table resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->action)); + /* jump action memory free is inside the table release. */ + flow_dv_tbl_resource_release(dev, &tbl_data->tbl); + DRV_LOG(DEBUG, "jump table resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release a modify-header resource. + * + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
+ */ +static int +flow_dv_modify_hdr_resource_release(struct mlx5_flow_handle *handle) +{ + struct mlx5_flow_dv_modify_hdr_resource *cache_resource = + handle->dvh.modify_hdr; + + MLX5_ASSERT(cache_resource->verbs_action); + DRV_LOG(DEBUG, "modify-header resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->verbs_action)); + LIST_REMOVE(cache_resource, next); + rte_free(cache_resource); + DRV_LOG(DEBUG, "modify-header resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release port ID action resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_port_id_action_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_dv_port_id_action_resource *cache_resource; + uint32_t idx = handle->rix_port_id_action; + + cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_PORT_ID], + idx); + if (!cache_resource) + return 0; + MLX5_ASSERT(cache_resource->action); + DRV_LOG(DEBUG, "port ID action resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->action)); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_PORT_ID], + &priv->sh->port_id_action_list, idx, + cache_resource, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_PORT_ID], idx); + DRV_LOG(DEBUG, "port id action resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release push vlan action resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_push_vlan_action_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t idx = handle->dvh.rix_push_vlan; + struct mlx5_flow_dv_push_vlan_action_resource *cache_resource; + + cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN], + idx); + if (!cache_resource) + return 0; + MLX5_ASSERT(cache_resource->action); + DRV_LOG(DEBUG, "push VLAN action resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->action)); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN], + &priv->sh->push_vlan_action_list, idx, + cache_resource, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN], idx); + DRV_LOG(DEBUG, "push vlan action resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release the fate resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. 
+ */ +static void +flow_dv_fate_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + if (!handle->rix_fate) + return; + if (handle->fate_action == MLX5_FLOW_FATE_DROP) + mlx5_hrxq_drop_release(dev); + else if (handle->fate_action == MLX5_FLOW_FATE_QUEUE) + mlx5_hrxq_release(dev, handle->rix_hrxq); + else if (handle->fate_action == MLX5_FLOW_FATE_JUMP) + flow_dv_jump_tbl_resource_release(dev, handle); + else if (handle->fate_action == MLX5_FLOW_FATE_PORT_ID) + flow_dv_port_id_action_resource_release(dev, handle); + else + DRV_LOG(DEBUG, "Incorrect fate action:%d", handle->fate_action); + handle->rix_fate = 0; +} + +/** + * Remove the flow from the NIC but keeps it in memory. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +__flow_dv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow_handle *dh; + uint32_t handle_idx; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow) + return; + handle_idx = flow->dev_handles; + while (handle_idx) { + dh = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + handle_idx); + if (!dh) + return; + if (dh->ib_flow) { + claim_zero(mlx5_glue->dv_destroy_flow(dh->ib_flow)); + dh->ib_flow = NULL; + } + if (dh->fate_action == MLX5_FLOW_FATE_DROP || + dh->fate_action == MLX5_FLOW_FATE_QUEUE) + flow_dv_fate_resource_release(dev, dh); + if (dh->vf_vlan.tag && dh->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &dh->vf_vlan); + handle_idx = dh->next.next; + } +} + +/** + * Remove the flow from the NIC and the memory. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +__flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow_handle *dev_handle; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow) + return; + __flow_dv_remove(dev, flow); + if (flow->counter) { + flow_dv_counter_release(dev, flow->counter); + flow->counter = 0; + } + if (flow->meter) { + struct mlx5_flow_meter *fm; + + fm = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MTR], + flow->meter); + if (fm) + mlx5_flow_meter_detach(fm); + flow->meter = 0; + } + while (flow->dev_handles) { + uint32_t tmp_idx = flow->dev_handles; + + dev_handle = mlx5_ipool_get(priv->sh->ipool + [MLX5_IPOOL_MLX5_FLOW], tmp_idx); + if (!dev_handle) + return; + flow->dev_handles = dev_handle->next.next; + if (dev_handle->dvh.matcher) + flow_dv_matcher_release(dev, dev_handle); + if (dev_handle->dvh.rix_encap_decap) + flow_dv_encap_decap_resource_release(dev, dev_handle); + if (dev_handle->dvh.modify_hdr) + flow_dv_modify_hdr_resource_release(dev_handle); + if (dev_handle->dvh.rix_push_vlan) + flow_dv_push_vlan_action_resource_release(dev, + dev_handle); + if (dev_handle->dvh.rix_tag) + flow_dv_tag_release(dev, + dev_handle->dvh.rix_tag); + flow_dv_fate_resource_release(dev, dev_handle); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + tmp_idx); + } +} + +/** + * Query a dv flow rule for its statistics via devx. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to the sub flow. + * @param[out] data + * data retrieved by the query. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_query_count(struct rte_eth_dev *dev, struct rte_flow *flow, + void *data, struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow_query_count *qc = data; + + if (!priv->config.devx) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not supported"); + if (flow->counter) { + uint64_t pkts, bytes; + struct mlx5_flow_counter *cnt; + + cnt = flow_dv_counter_get_by_idx(dev, flow->counter, + NULL); + int err = _flow_dv_query_count(dev, flow->counter, &pkts, + &bytes); + + if (err) + return rte_flow_error_set(error, -err, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot read counters"); + qc->hits_set = 1; + qc->bytes_set = 1; + qc->hits = pkts - cnt->hits; + qc->bytes = bytes - cnt->bytes; + if (qc->reset) { + cnt->hits = pkts; + cnt->bytes = bytes; + } + return 0; + } + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not available"); +} + +/** + * Query a flow. + * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_dv_query(struct rte_eth_dev *dev, + struct rte_flow *flow __rte_unused, + const struct rte_flow_action *actions __rte_unused, + void *data __rte_unused, + struct rte_flow_error *error __rte_unused) +{ + int ret = -EINVAL; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_dv_query_count(dev, flow, data, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + return ret; +} + +/** + * Destroy the meter table set. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] tbl + * Pointer to the meter table set. + * + * @return + * Always 0. 
+ */ +static int +flow_dv_destroy_mtr_tbl(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbl) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_meter_domains_infos *mtd = + (struct mlx5_meter_domains_infos *)tbl; + + if (!mtd || !priv->config.dv_flow_en) + return 0; + if (mtd->ingress.policer_rules[RTE_MTR_DROPPED]) + claim_zero(mlx5_glue->dv_destroy_flow + (mtd->ingress.policer_rules[RTE_MTR_DROPPED])); + if (mtd->egress.policer_rules[RTE_MTR_DROPPED]) + claim_zero(mlx5_glue->dv_destroy_flow + (mtd->egress.policer_rules[RTE_MTR_DROPPED])); + if (mtd->transfer.policer_rules[RTE_MTR_DROPPED]) + claim_zero(mlx5_glue->dv_destroy_flow + (mtd->transfer.policer_rules[RTE_MTR_DROPPED])); + if (mtd->egress.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->egress.color_matcher)); + if (mtd->egress.any_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->egress.any_matcher)); + if (mtd->egress.tbl) + flow_dv_tbl_resource_release(dev, mtd->egress.tbl); + if (mtd->egress.sfx_tbl) + flow_dv_tbl_resource_release(dev, mtd->egress.sfx_tbl); + if (mtd->ingress.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->ingress.color_matcher)); + if (mtd->ingress.any_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->ingress.any_matcher)); + if (mtd->ingress.tbl) + flow_dv_tbl_resource_release(dev, mtd->ingress.tbl); + if (mtd->ingress.sfx_tbl) + flow_dv_tbl_resource_release(dev, mtd->ingress.sfx_tbl); + if (mtd->transfer.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->transfer.color_matcher)); + if (mtd->transfer.any_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->transfer.any_matcher)); + if (mtd->transfer.tbl) + flow_dv_tbl_resource_release(dev, mtd->transfer.tbl); + if (mtd->transfer.sfx_tbl) + flow_dv_tbl_resource_release(dev, mtd->transfer.sfx_tbl); + if (mtd->drop_actn) + claim_zero(mlx5_glue->destroy_flow_action(mtd->drop_actn)); + rte_free(mtd); + return 0; +} + +/* Number of meter flow actions, count and jump or count and drop. */ +#define METER_ACTIONS 2 + +/** + * Create specify domain meter table and suffix table. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in,out] mtb + * Pointer to DV meter table set. + * @param[in] egress + * Table attribute. + * @param[in] transfer + * Table attribute. + * @param[in] color_reg_c_idx + * Reg C index for color match. + * + * @return + * 0 on success, -1 otherwise and rte_errno is set. + */ +static int +flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *mtb, + uint8_t egress, uint8_t transfer, + uint32_t color_reg_c_idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_match_params mask = { + .size = sizeof(mask.buf), + }; + struct mlx5_flow_dv_match_params value = { + .size = sizeof(value.buf), + }; + struct mlx5dv_flow_matcher_attr dv_attr = { + .type = IBV_FLOW_ATTR_NORMAL, + .priority = 0, + .match_criteria_enable = 0, + .match_mask = (void *)&mask, + }; + void *actions[METER_ACTIONS]; + struct mlx5_meter_domain_info *dtb; + struct rte_flow_error error; + int i = 0; + + if (transfer) + dtb = &mtb->transfer; + else if (egress) + dtb = &mtb->egress; + else + dtb = &mtb->ingress; + /* Create the meter table with METER level. 
*/ + dtb->tbl = flow_dv_tbl_resource_get(dev, MLX5_FLOW_TABLE_LEVEL_METER, + egress, transfer, &error); + if (!dtb->tbl) { + DRV_LOG(ERR, "Failed to create meter policer table."); + return -1; + } + /* Create the meter suffix table with SUFFIX level. */ + dtb->sfx_tbl = flow_dv_tbl_resource_get(dev, + MLX5_FLOW_TABLE_LEVEL_SUFFIX, + egress, transfer, &error); + if (!dtb->sfx_tbl) { + DRV_LOG(ERR, "Failed to create meter suffix table."); + return -1; + } + /* Create matchers, Any and Color. */ + dv_attr.priority = 3; + dv_attr.match_criteria_enable = 0; + dtb->any_matcher = mlx5_glue->dv_create_flow_matcher(sh->ctx, + &dv_attr, + dtb->tbl->obj); + if (!dtb->any_matcher) { + DRV_LOG(ERR, "Failed to create meter" + " policer default matcher."); + goto error_exit; + } + dv_attr.priority = 0; + dv_attr.match_criteria_enable = + 1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT; + flow_dv_match_meta_reg(mask.buf, value.buf, color_reg_c_idx, + rte_col_2_mlx5_col(RTE_COLORS), UINT8_MAX); + dtb->color_matcher = mlx5_glue->dv_create_flow_matcher(sh->ctx, + &dv_attr, + dtb->tbl->obj); + if (!dtb->color_matcher) { + DRV_LOG(ERR, "Failed to create meter policer color matcher."); + goto error_exit; + } + if (mtb->count_actns[RTE_MTR_DROPPED]) + actions[i++] = mtb->count_actns[RTE_MTR_DROPPED]; + actions[i++] = mtb->drop_actn; + /* Default rule: lowest priority, match any, actions: drop. */ + dtb->policer_rules[RTE_MTR_DROPPED] = + mlx5_glue->dv_create_flow(dtb->any_matcher, + (void *)&value, i, actions); + if (!dtb->policer_rules[RTE_MTR_DROPPED]) { + DRV_LOG(ERR, "Failed to create meter policer drop rule."); + goto error_exit; + } + return 0; +error_exit: + return -1; +} + +/** + * Create the needed meter and suffix tables. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to the flow meter. + * + * @return + * Pointer to table set on success, NULL otherwise and rte_errno is set. + */ +static struct mlx5_meter_domains_infos * +flow_dv_create_mtr_tbl(struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_meter_domains_infos *mtb; + int ret; + int i; + + if (!priv->mtr_en) { + rte_errno = ENOTSUP; + return NULL; + } + mtb = rte_calloc(__func__, 1, sizeof(*mtb), 0); + if (!mtb) { + DRV_LOG(ERR, "Failed to allocate memory for meter."); + return NULL; + } + /* Create meter count actions */ + for (i = 0; i <= RTE_MTR_DROPPED; i++) { + struct mlx5_flow_counter *cnt; + if (!fm->policer_stats.cnt[i]) + continue; + cnt = flow_dv_counter_get_by_idx(dev, + fm->policer_stats.cnt[i], NULL); + mtb->count_actns[i] = cnt->action; + } + /* Create drop action. */ + mtb->drop_actn = mlx5_glue->dr_create_flow_action_drop(); + if (!mtb->drop_actn) { + DRV_LOG(ERR, "Failed to create drop action."); + goto error_exit; + } + /* Egress meter table. */ + ret = flow_dv_prepare_mtr_tables(dev, mtb, 1, 0, priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to prepare egress meter table."); + goto error_exit; + } + /* Ingress meter table. */ + ret = flow_dv_prepare_mtr_tables(dev, mtb, 0, 0, priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to prepare ingress meter table."); + goto error_exit; + } + /* FDB meter table. 
*/ + if (priv->config.dv_esw_en) { + ret = flow_dv_prepare_mtr_tables(dev, mtb, 0, 1, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to prepare fdb meter table."); + goto error_exit; + } + } + return mtb; +error_exit: + flow_dv_destroy_mtr_tbl(dev, mtb); + return NULL; +} + +/** + * Destroy domain policer rule. + * + * @param[in] dt + * Pointer to domain table. + */ +static void +flow_dv_destroy_domain_policer_rule(struct mlx5_meter_domain_info *dt) +{ + int i; + + for (i = 0; i < RTE_MTR_DROPPED; i++) { + if (dt->policer_rules[i]) { + claim_zero(mlx5_glue->dv_destroy_flow + (dt->policer_rules[i])); + dt->policer_rules[i] = NULL; + } + } + if (dt->jump_actn) { + claim_zero(mlx5_glue->destroy_flow_action(dt->jump_actn)); + dt->jump_actn = NULL; + } +} + +/** + * Destroy policer rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * Always 0. + */ +static int +flow_dv_destroy_policer_rules(struct rte_eth_dev *dev __rte_unused, + const struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + struct mlx5_meter_domains_infos *mtb = fm ? fm->mfts : NULL; + + if (!mtb) + return 0; + if (attr->egress) + flow_dv_destroy_domain_policer_rule(&mtb->egress); + if (attr->ingress) + flow_dv_destroy_domain_policer_rule(&mtb->ingress); + if (attr->transfer) + flow_dv_destroy_domain_policer_rule(&mtb->transfer); + return 0; +} + +/** + * Create specify domain meter policer rule. + * + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] mtb + * Pointer to DV meter table set. + * @param[in] mtr_reg_c + * Color match REG_C. + * + * @return + * 0 on success, -1 otherwise. + */ +static int +flow_dv_create_policer_forward_rule(struct mlx5_flow_meter *fm, + struct mlx5_meter_domain_info *dtb, + uint8_t mtr_reg_c) +{ + struct mlx5_flow_dv_match_params matcher = { + .size = sizeof(matcher.buf), + }; + struct mlx5_flow_dv_match_params value = { + .size = sizeof(value.buf), + }; + struct mlx5_meter_domains_infos *mtb = fm->mfts; + void *actions[METER_ACTIONS]; + int i; + + /* Create jump action. */ + if (!dtb->jump_actn) + dtb->jump_actn = + mlx5_glue->dr_create_flow_action_dest_flow_tbl + (dtb->sfx_tbl->obj); + if (!dtb->jump_actn) { + DRV_LOG(ERR, "Failed to create policer jump action."); + goto error; + } + for (i = 0; i < RTE_MTR_DROPPED; i++) { + int j = 0; + + flow_dv_match_meta_reg(matcher.buf, value.buf, mtr_reg_c, + rte_col_2_mlx5_col(i), UINT8_MAX); + if (mtb->count_actns[i]) + actions[j++] = mtb->count_actns[i]; + if (fm->action[i] == MTR_POLICER_ACTION_DROP) + actions[j++] = mtb->drop_actn; + else + actions[j++] = dtb->jump_actn; + dtb->policer_rules[i] = + mlx5_glue->dv_create_flow(dtb->color_matcher, + (void *)&value, + j, actions); + if (!dtb->policer_rules[i]) { + DRV_LOG(ERR, "Failed to create policer rule."); + goto error; + } + } + return 0; +error: + rte_errno = errno; + return -1; +} + +/** + * Create policer rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * 0 on success, -1 otherwise. 
+ */ +static int +flow_dv_create_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_meter_domains_infos *mtb = fm->mfts; + int ret; + + if (attr->egress) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->egress, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create egress policer."); + goto error; + } + } + if (attr->ingress) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->ingress, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create ingress policer."); + goto error; + } + } + if (attr->transfer) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->transfer, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create transfer policer."); + goto error; + } + } + return 0; +error: + flow_dv_destroy_policer_rules(dev, fm, attr); + return -1; +} + +/** + * Query a devx counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] cnt + * Index to the flow counter. + * @param[in] clear + * Set to clear the counter statistics. + * @param[out] pkts + * The statistics value of packets. + * @param[out] bytes + * The statistics value of bytes. + * + * @return + * 0 on success, otherwise return -1. + */ +static int +flow_dv_counter_query(struct rte_eth_dev *dev, uint32_t counter, bool clear, + uint64_t *pkts, uint64_t *bytes) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter *cnt; + uint64_t inn_pkts, inn_bytes; + int ret; + + if (!priv->config.devx) + return -1; + + ret = _flow_dv_query_count(dev, counter, &inn_pkts, &inn_bytes); + if (ret) + return -1; + cnt = flow_dv_counter_get_by_idx(dev, counter, NULL); + *pkts = inn_pkts - cnt->hits; + *bytes = inn_bytes - cnt->bytes; + if (clear) { + cnt->hits = inn_pkts; + cnt->bytes = inn_bytes; + } + return 0; +} + +/** + * Get aged-out flows. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] context + * The address of an array of pointers to the aged-out flows contexts. + * @param[in] nb_contexts + * The length of context array pointers. + * @param[out] error + * Perform verbose error reporting if not NULL. Initialized in case of + * error only. + * + * @return + * how many contexts get in success, otherwise negative errno value. + * if nb_contexts is 0, return the amount of all aged contexts. + * if nb_contexts is not 0 , return the amount of aged flows reported + * in the context array. 
+ * @note: only stub for now + */ +static int +flow_get_aged_flows(struct rte_eth_dev *dev, + void **context, + uint32_t nb_contexts, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_age_info *age_info; + struct mlx5_age_param *age_param; + struct mlx5_flow_counter *counter; + int nb_flows = 0; + + if (nb_contexts && !context) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "Should assign at least one flow or" + " context to get if nb_contexts != 0"); + age_info = GET_PORT_AGE_INFO(priv); + rte_spinlock_lock(&age_info->aged_sl); + TAILQ_FOREACH(counter, &age_info->aged_counters, next) { + nb_flows++; + if (nb_contexts) { + age_param = MLX5_CNT_TO_AGE(counter); + context[nb_flows - 1] = age_param->context; + if (!(--nb_contexts)) + break; + } + } + rte_spinlock_unlock(&age_info->aged_sl); + MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER); + return nb_flows; +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_translate(). + */ +static int +flow_dv_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + int ret; + + flow_dv_shared_lock(dev); + ret = __flow_dv_translate(dev, dev_flow, attr, items, actions, error); + flow_dv_shared_unlock(dev); + return ret; +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_apply(). + */ +static int +flow_dv_apply(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + int ret; + + flow_dv_shared_lock(dev); + ret = __flow_dv_apply(dev, flow, error); + flow_dv_shared_unlock(dev); + return ret; +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_remove(). + */ +static void +flow_dv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + flow_dv_shared_lock(dev); + __flow_dv_remove(dev, flow); + flow_dv_shared_unlock(dev); +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_destroy(). + */ +static void +flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + flow_dv_shared_lock(dev); + __flow_dv_destroy(dev, flow); + flow_dv_shared_unlock(dev); +} + +/* + * Mutex-protected thunk to lock-free flow_dv_counter_alloc(). + */ +static uint32_t +flow_dv_counter_allocate(struct rte_eth_dev *dev) +{ + uint32_t cnt; + + flow_dv_shared_lock(dev); + cnt = flow_dv_counter_alloc(dev, 0, 0, 1, 0); + flow_dv_shared_unlock(dev); + return cnt; +} + +/* + * Mutex-protected thunk to lock-free flow_dv_counter_release(). 
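+ * Like the translate/apply/remove/destroy thunks above, this only wraps the
+ * lock-free implementation with flow_dv_shared_lock()/flow_dv_shared_unlock()
+ * so that concurrent users of the shared device context do not modify DV
+ * resources at the same time.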
+ */ +static void +flow_dv_counter_free(struct rte_eth_dev *dev, uint32_t cnt) +{ + flow_dv_shared_lock(dev); + flow_dv_counter_release(dev, cnt); + flow_dv_shared_unlock(dev); +} + +const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops = { + .validate = flow_dv_validate, + .prepare = flow_dv_prepare, + .translate = flow_dv_translate, + .apply = flow_dv_apply, + .remove = flow_dv_remove, + .destroy = flow_dv_destroy, + .query = flow_dv_query, + .create_mtr_tbls = flow_dv_create_mtr_tbl, + .destroy_mtr_tbls = flow_dv_destroy_mtr_tbl, + .create_policer_rules = flow_dv_create_policer_rules, + .destroy_policer_rules = flow_dv_destroy_policer_rules, + .counter_alloc = flow_dv_counter_allocate, + .counter_free = flow_dv_counter_free, + .counter_query = flow_dv_counter_query, + .get_aged_flows = flow_get_aged_flows, +}; + +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c new file mode 100644 index 000000000..08f7dc8d1 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c @@ -0,0 +1,1292 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Copyright 2018 Mellanox Technologies, Ltd + */ +#include <math.h> + +#include <rte_tailq.h> +#include <rte_malloc.h> +#include <rte_mtr.h> +#include <rte_mtr_driver.h> + +#include <mlx5_devx_cmds.h> + +#include "mlx5.h" +#include "mlx5_flow.h" + +/** + * Create the meter action. + * + * @param priv + * Pointer to mlx5_priv. + * @param[in] fm + * Pointer to flow meter to be converted. + * + * @return + * Pointer to the meter action on success, NULL otherwise. + */ +static void * +mlx5_flow_meter_action_create(struct mlx5_priv *priv, + struct mlx5_flow_meter *fm) +{ +#ifdef HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER + struct mlx5dv_dr_flow_meter_attr mtr_init; + void *attr = fm->mfts->fmp; + struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm = + &fm->profile->srtcm_prm; + + fm->mfts->fmp_size = MLX5_ST_SZ_BYTES(flow_meter_parameters); + memset(attr, 0, fm->mfts->fmp_size); + MLX5_SET(flow_meter_parameters, attr, valid, 1); + MLX5_SET(flow_meter_parameters, attr, bucket_overflow, 1); + MLX5_SET(flow_meter_parameters, attr, + start_color, MLX5_FLOW_COLOR_GREEN); + MLX5_SET(flow_meter_parameters, attr, both_buckets_on_green, 0); + MLX5_SET(flow_meter_parameters, + attr, cbs_exponent, srtcm->cbs_exponent); + MLX5_SET(flow_meter_parameters, + attr, cbs_mantissa, srtcm->cbs_mantissa); + MLX5_SET(flow_meter_parameters, + attr, cir_exponent, srtcm->cir_exponent); + MLX5_SET(flow_meter_parameters, + attr, cir_mantissa, srtcm->cir_mantissa); + MLX5_SET(flow_meter_parameters, + attr, ebs_exponent, srtcm->ebs_exponent); + MLX5_SET(flow_meter_parameters, + attr, ebs_mantissa, srtcm->ebs_mantissa); + mtr_init.next_table = + fm->transfer ? fm->mfts->transfer.tbl->obj : + fm->egress ? fm->mfts->egress.tbl->obj : + fm->mfts->ingress.tbl->obj; + mtr_init.reg_c_index = priv->mtr_color_reg - REG_C_0; + mtr_init.flow_meter_parameter = fm->mfts->fmp; + mtr_init.flow_meter_parameter_sz = fm->mfts->fmp_size; + mtr_init.active = fm->active_state; + return mlx5_glue->dv_create_flow_action_meter(&mtr_init); +#else + (void)priv; + (void)fm; + return NULL; +#endif +} + +/** + * Find meter profile by id. + * + * @param priv + * Pointer to mlx5_priv. + * @param meter_profile_id + * Meter profile id. + * + * @return + * Pointer to the profile found on success, NULL otherwise. 
+ */ +static struct mlx5_flow_meter_profile * +mlx5_flow_meter_profile_find(struct mlx5_priv *priv, uint32_t meter_profile_id) +{ + struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles; + struct mlx5_flow_meter_profile *fmp; + + TAILQ_FOREACH(fmp, fmps, next) + if (meter_profile_id == fmp->meter_profile_id) + return fmp; + return NULL; +} + +/** + * Validate the MTR profile. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_profile_id + * Meter profile id. + * @param[in] profile + * Pointer to meter profile detail. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_validate(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_meter_profile *profile, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter_profile *fmp; + + /* Profile must not be NULL. */ + if (profile == NULL) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_METER_PROFILE, + NULL, "Meter profile is null."); + /* Meter profile ID must be valid. */ + if (meter_profile_id == UINT32_MAX) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile id not valid."); + /* Meter profile must not exist. */ + fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id); + if (fmp) + return -rte_mtr_error_set(error, EEXIST, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, + "Meter profile already exists."); + if (profile->alg == RTE_MTR_SRTCM_RFC2697) { + if (priv->config.hca_attr.qos.srtcm_sup) { + /* Verify support for flow meter parameters. */ + if (profile->srtcm_rfc2697.cir > 0 && + profile->srtcm_rfc2697.cir <= MLX5_SRTCM_CIR_MAX && + profile->srtcm_rfc2697.cbs > 0 && + profile->srtcm_rfc2697.cbs <= MLX5_SRTCM_CBS_MAX && + profile->srtcm_rfc2697.ebs <= MLX5_SRTCM_EBS_MAX) + return 0; + else + return -rte_mtr_error_set + (error, ENOTSUP, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, + profile->srtcm_rfc2697.ebs ? + "Metering value ebs must be 0." : + "Invalid metering parameters."); + } + } + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_METER_PROFILE, + NULL, "Metering algorithm not supported."); +} + +/** + * Calculate mantissa and exponent for cir. + * + * @param[in] cir + * Value to be calculated. + * @param[out] man + * Pointer to the mantissa. + * @param[out] exp + * Pointer to the exp. + */ +static void +mlx5_flow_meter_cir_man_exp_calc(int64_t cir, uint8_t *man, uint8_t *exp) +{ + int64_t _cir; + int64_t delta = INT64_MAX; + uint8_t _man = 0; + uint8_t _exp = 0; + uint64_t m, e; + + for (m = 0; m <= 0xFF; m++) { /* man width 8 bit */ + for (e = 0; e <= 0x1F; e++) { /* exp width 5bit */ + _cir = (1000000000ULL * m) >> e; + if (llabs(cir - _cir) <= delta) { + delta = llabs(cir - _cir); + _man = m; + _exp = e; + } + } + } + *man = _man; + *exp = _exp; +} + +/** + * Calculate mantissa and exponent for xbs. + * + * @param[in] xbs + * Value to be calculated. + * @param[out] man + * Pointer to the mantissa. + * @param[out] exp + * Pointer to the exp. + */ +static void +mlx5_flow_meter_xbs_man_exp_calc(uint64_t xbs, uint8_t *man, uint8_t *exp) +{ + int _exp; + double _man; + + /* Special case xbs == 0 ? both exp and matissa are 0. 
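+	 * Otherwise xbs is decomposed with frexp() and rescaled to an integer
+	 * mantissa. As a worked example (assuming MLX5_MAN_WIDTH == 8, in line
+	 * with the 8-bit mantissa used elsewhere in this file): xbs = 49152
+	 * gives frexp() = 0.75 * 2^16, hence man = 0.75 * 256 = 192 and
+	 * exp = 16 - 8 = 8, and indeed 192 * 2^8 = 49152. ceil() rounds the
+	 * mantissa up, so the programmed burst size is never below the
+	 * requested one.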
*/ + if (xbs == 0) { + *man = 0; + *exp = 0; + return; + } + /* xbs = xbs_mantissa * 2^xbs_exponent */ + _man = frexp(xbs, &_exp); + _man = _man * pow(2, MLX5_MAN_WIDTH); + _exp = _exp - MLX5_MAN_WIDTH; + *man = (uint8_t)ceil(_man); + *exp = _exp; +} + +/** + * Fill the prm meter parameter. + * + * @param[in,out] fmp + * Pointer to meter profie to be converted. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_param_fill(struct mlx5_flow_meter_profile *fmp, + struct rte_mtr_error *error) +{ + struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm = &fmp->srtcm_prm; + uint8_t man, exp; + + if (fmp->profile.alg != RTE_MTR_SRTCM_RFC2697) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_METER_PROFILE, + NULL, "Metering algorithm not supported."); + /* cbs = cbs_mantissa * 2^cbs_exponent */ + mlx5_flow_meter_xbs_man_exp_calc(fmp->profile.srtcm_rfc2697.cbs, + &man, &exp); + srtcm->cbs_mantissa = man; + srtcm->cbs_exponent = exp; + /* Check if cbs mantissa is too large. */ + if (srtcm->cbs_exponent != exp) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL, + "Metering profile parameter cbs is" + " invalid."); + /* ebs = ebs_mantissa * 2^ebs_exponent */ + mlx5_flow_meter_xbs_man_exp_calc(fmp->profile.srtcm_rfc2697.ebs, + &man, &exp); + srtcm->ebs_mantissa = man; + srtcm->ebs_exponent = exp; + /* Check if ebs mantissa is too large. */ + if (srtcm->ebs_exponent != exp) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL, + "Metering profile parameter ebs is" + " invalid."); + /* cir = 8G * cir_mantissa * 1/(2^cir_exponent)) Bytes/Sec */ + mlx5_flow_meter_cir_man_exp_calc(fmp->profile.srtcm_rfc2697.cir, + &man, &exp); + srtcm->cir_mantissa = man; + srtcm->cir_exponent = exp; + /* Check if cir mantissa is too large. */ + if (srtcm->cir_exponent != exp) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL, + "Metering profile parameter cir is" + " invalid."); + return 0; +} + +/** + * Callback to get MTR capabilities. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] cap + * Pointer to save MTR capabilities. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_mtr_cap_get(struct rte_eth_dev *dev, + struct rte_mtr_capabilities *cap, + struct rte_mtr_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hca_qos_attr *qattr = &priv->config.hca_attr.qos; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + memset(cap, 0, sizeof(*cap)); + cap->n_max = 1 << qattr->log_max_flow_meter; + cap->n_shared_max = cap->n_max; + cap->identical = 1; + cap->shared_identical = 1; + cap->shared_n_flows_per_mtr_max = 4 << 20; + /* 2M flows can share the same meter. */ + cap->chaining_n_mtrs_per_flow_max = 1; /* Chaining is not supported. */ + cap->meter_srtcm_rfc2697_n_max = qattr->srtcm_sup ? cap->n_max : 0; + cap->meter_rate_max = 1ULL << 40; /* 1 Tera tokens per sec. */ + cap->policer_action_drop_supported = 1; + cap->stats_mask = RTE_MTR_STATS_N_BYTES_DROPPED | + RTE_MTR_STATS_N_PKTS_DROPPED; + return 0; +} + +/** + * Callback to add MTR profile. + * + * @param[in] dev + * Pointer to Ethernet device. 
+ * @param[in] meter_profile_id + * Meter profile id. + * @param[in] profile + * Pointer to meter profile detail. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_add(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_meter_profile *profile, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles; + struct mlx5_flow_meter_profile *fmp; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Check input params. */ + ret = mlx5_flow_meter_profile_validate(dev, meter_profile_id, + profile, error); + if (ret) + return ret; + /* Meter profile memory allocation. */ + fmp = rte_calloc(__func__, 1, sizeof(struct mlx5_flow_meter_profile), + RTE_CACHE_LINE_SIZE); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOMEM, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, "Meter profile memory " + "alloc failed."); + /* Fill profile info. */ + fmp->meter_profile_id = meter_profile_id; + fmp->profile = *profile; + /* Fill the flow meter parameters for the PRM. */ + ret = mlx5_flow_meter_param_fill(fmp, error); + if (ret) + goto error; + /* Add to list. */ + TAILQ_INSERT_TAIL(fmps, fmp, next); + return 0; +error: + rte_free(fmp); + return ret; +} + +/** + * Callback to delete MTR profile. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_profile_id + * Meter profile id. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_delete(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter_profile *fmp; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter profile must exist. */ + fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + &meter_profile_id, + "Meter profile id invalid."); + /* Check profile is unused. */ + if (fmp->ref_cnt) + return -rte_mtr_error_set(error, EBUSY, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile in use."); + /* Remove from list. */ + TAILQ_REMOVE(&priv->flow_meter_profiles, fmp, next); + rte_free(fmp); + return 0; +} + +/** + * Convert wrong color setting action to verbose error. + * + * @param[in] action + * Policy color action. + * + * @return + * Verbose meter color error type. + */ +static inline enum rte_mtr_error_type +action2error(enum rte_mtr_policer_action action) +{ + switch (action) { + case MTR_POLICER_ACTION_COLOR_GREEN: + return RTE_MTR_ERROR_TYPE_POLICER_ACTION_GREEN; + case MTR_POLICER_ACTION_COLOR_YELLOW: + return RTE_MTR_ERROR_TYPE_POLICER_ACTION_YELLOW; + case MTR_POLICER_ACTION_COLOR_RED: + return RTE_MTR_ERROR_TYPE_POLICER_ACTION_RED; + default: + break; + } + return RTE_MTR_ERROR_TYPE_UNSPECIFIED; +} + +/** + * Check meter validation. + * + * @param[in] priv + * Pointer to mlx5 private data structure. + * @param[in] meter_id + * Meter id. + * @param[in] params + * Pointer to rte meter parameters. 
+ * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_validate(struct mlx5_priv *priv, uint32_t meter_id, + struct rte_mtr_params *params, + struct rte_mtr_error *error) +{ + static enum rte_mtr_policer_action + valid_recol_action[RTE_COLORS] = { + MTR_POLICER_ACTION_COLOR_GREEN, + MTR_POLICER_ACTION_COLOR_YELLOW, + MTR_POLICER_ACTION_COLOR_RED }; + int i; + + /* Meter params must not be NULL. */ + if (params == NULL) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, "Meter object params null."); + /* Previous meter color is not supported. */ + if (params->use_prev_mtr_color) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, + "Previous meter color " + "not supported."); + /* Validate policer settings. */ + for (i = 0; i < RTE_COLORS; i++) + if (params->action[i] != valid_recol_action[i] && + params->action[i] != MTR_POLICER_ACTION_DROP) + return -rte_mtr_error_set + (error, ENOTSUP, + action2error(params->action[i]), NULL, + "Recolor action not supported."); + /* Validate meter id. */ + if (mlx5_flow_meter_find(priv, meter_id)) + return -rte_mtr_error_set(error, EEXIST, + RTE_MTR_ERROR_TYPE_MTR_ID, NULL, + "Meter object already exists."); + return 0; +} + +/** + * Modify the flow meter action. + * + * @param[in] priv + * Pointer to mlx5 private data structure. + * @param[in] fm + * Pointer to flow meter to be modified. + * @param[in] srtcm + * Pointer to meter srtcm description parameter. + * @param[in] modify_bits + * The bit in srtcm to be updated. + * @param[in] active_state + * The state to be updated. + * @return + * 0 on success, o negative value otherwise. + */ +static int +mlx5_flow_meter_action_modify(struct mlx5_priv *priv, + struct mlx5_flow_meter *fm, + const struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm, + uint64_t modify_bits, uint32_t active_state) +{ +#ifdef HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER + uint32_t in[MLX5_ST_SZ_DW(flow_meter_parameters)] = { 0 }; + uint32_t *attr; + struct mlx5dv_dr_flow_meter_attr mod_attr = { 0 }; + int ret; + + /* Fill command parameters. */ + mod_attr.reg_c_index = priv->mtr_color_reg - REG_C_0; + mod_attr.flow_meter_parameter = in; + mod_attr.flow_meter_parameter_sz = fm->mfts->fmp_size; + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_ACTIVE) + mod_attr.active = !!active_state; + else + mod_attr.active = 0; + attr = in; + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS) { + MLX5_SET(flow_meter_parameters, + attr, cbs_exponent, srtcm->cbs_exponent); + MLX5_SET(flow_meter_parameters, + attr, cbs_mantissa, srtcm->cbs_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR) { + MLX5_SET(flow_meter_parameters, + attr, cir_exponent, srtcm->cir_exponent); + MLX5_SET(flow_meter_parameters, + attr, cir_mantissa, srtcm->cir_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_EBS) { + MLX5_SET(flow_meter_parameters, + attr, ebs_exponent, srtcm->ebs_exponent); + MLX5_SET(flow_meter_parameters, + attr, ebs_mantissa, srtcm->ebs_mantissa); + } + /* Apply modifications to meter only if it was created. */ + if (fm->mfts->meter_action) { + ret = mlx5_glue->dv_modify_flow_action_meter + (fm->mfts->meter_action, &mod_attr, + rte_cpu_to_be_64(modify_bits)); + if (ret) + return ret; + } + /* Update succeedded modify meter parameters. 
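+	 * The same fields are mirrored into the cached parameter block
+	 * fm->mfts->fmp below, so the software copy used when the meter
+	 * action is (re)created stays consistent with what was pushed to the
+	 * hardware object.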
*/ + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_ACTIVE) + fm->active_state = !!active_state; + attr = fm->mfts->fmp; + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS) { + MLX5_SET(flow_meter_parameters, + attr, cbs_exponent, srtcm->cbs_exponent); + MLX5_SET(flow_meter_parameters, + attr, cbs_mantissa, srtcm->cbs_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR) { + MLX5_SET(flow_meter_parameters, + attr, cir_exponent, srtcm->cir_exponent); + MLX5_SET(flow_meter_parameters, + attr, cir_mantissa, srtcm->cir_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_EBS) { + MLX5_SET(flow_meter_parameters, + attr, ebs_exponent, srtcm->ebs_exponent); + MLX5_SET(flow_meter_parameters, + attr, ebs_mantissa, srtcm->ebs_mantissa); + } + + return 0; +#else + (void)priv; + (void)fm; + (void)srtcm; + (void)modify_bits; + (void)active_state; + return -ENOTSUP; +#endif +} + +/** + * Create meter rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[in] params + * Pointer to rte meter parameters. + * @param[in] shared + * Meter shared with other flow or not. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_create(struct rte_eth_dev *dev, uint32_t meter_id, + struct rte_mtr_params *params, int shared, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter *fm; + const struct rte_flow_attr attr = { + .ingress = 1, + .egress = 1, + .transfer = priv->config.dv_esw_en ? 1 : 0, + }; + int ret; + unsigned int i; + uint32_t idx = 0; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Validate the parameters. */ + ret = mlx5_flow_meter_validate(priv, meter_id, params, error); + if (ret) + return ret; + /* Meter profile must exist. */ + fmp = mlx5_flow_meter_profile_find(priv, params->meter_profile_id); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile id not valid."); + /* Allocate the flow meter memory. */ + fm = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MTR], &idx); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOMEM, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Memory alloc failed for meter."); + fm->idx = idx; + /* Fill the flow meter parameters. */ + fm->meter_id = meter_id; + fm->profile = fmp; + memcpy(fm->action, params->action, sizeof(params->action)); + fm->stats_mask = params->stats_mask; + + /* Alloc policer counters. */ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) { + fm->policer_stats.cnt[i] = mlx5_counter_alloc(dev); + if (!fm->policer_stats.cnt[i]) + goto error; + } + fm->mfts = mlx5_flow_create_mtr_tbls(dev, fm); + if (!fm->mfts) + goto error; + ret = mlx5_flow_create_policer_rules(dev, fm, &attr); + if (ret) + goto error; + /* Add to the flow meter list. */ + TAILQ_INSERT_TAIL(fms, fm, next); + fm->active_state = 1; /* Config meter starts as active. */ + fm->shared = !!shared; + fm->policer_stats.stats_mask = params->stats_mask; + fm->profile->ref_cnt++; + return 0; +error: + mlx5_flow_destroy_policer_rules(dev, fm, &attr); + mlx5_flow_destroy_mtr_tbls(dev, fm->mfts); + /* Free policer counters. 
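+	 * Together with the destroy calls just above, this unwinds everything
+	 * allocated so far (rules, tables, counters) before the ipool entry is
+	 * released and the error is reported.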
*/ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) + if (fm->policer_stats.cnt[i]) + mlx5_counter_free(dev, fm->policer_stats.cnt[i]); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], idx); + return -rte_mtr_error_set(error, -ret, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, "Failed to create devx meter."); +} + +/** + * Destroy meter rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_destroy(struct rte_eth_dev *dev, uint32_t meter_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter *fm; + const struct rte_flow_attr attr = { + .ingress = 1, + .egress = 1, + .transfer = priv->config.dv_esw_en ? 1 : 0, + }; + unsigned int i; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter object id not valid."); + /* Meter object must not have any owner. */ + if (fm->ref_cnt > 0) + return -rte_mtr_error_set(error, EBUSY, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, "Meter object is being used."); + /* Get the meter profile. */ + fmp = fm->profile; + MLX5_ASSERT(fmp); + /* Update dependencies. */ + fmp->ref_cnt--; + /* Remove from the flow meter list. */ + TAILQ_REMOVE(fms, fm, next); + /* Free policer counters. */ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) + if (fm->policer_stats.cnt[i]) + mlx5_counter_free(dev, fm->policer_stats.cnt[i]); + /* Free meter flow table */ + mlx5_flow_destroy_policer_rules(dev, fm, &attr); + mlx5_flow_destroy_mtr_tbls(dev, fm->mfts); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], fm->idx); + return 0; +} + +/** + * Modify meter state. + * + * @param[in] priv + * Pointer to mlx5 private data structure. + * @param[in] fm + * Pointer to flow meter. + * @param[in] new_state + * New state to update. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_modify_state(struct mlx5_priv *priv, + struct mlx5_flow_meter *fm, + uint32_t new_state, + struct rte_mtr_error *error) +{ + static const struct mlx5_flow_meter_srtcm_rfc2697_prm srtcm = { + .cbs_exponent = 20, + .cbs_mantissa = 191, + .cir_exponent = 0, + .cir_mantissa = 200, + .ebs_exponent = 0, + .ebs_mantissa = 0, + }; + uint64_t modify_bits = MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS | + MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR; + int ret; + + if (new_state == MLX5_FLOW_METER_DISABLE) + ret = mlx5_flow_meter_action_modify(priv, fm, &srtcm, + modify_bits, 0); + else + ret = mlx5_flow_meter_action_modify(priv, fm, + &fm->profile->srtcm_prm, + modify_bits, 0); + if (ret) + return -rte_mtr_error_set(error, -ret, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, + new_state ? + "Failed to enable meter." : + "Failed to disable meter."); + return 0; +} + +/** + * Callback to enable flow meter. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] error + * Pointer to rte meter error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_enable(struct rte_eth_dev *dev, + uint32_t meter_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter not found."); + if (fm->active_state == MLX5_FLOW_METER_ENABLE) + return 0; + ret = mlx5_flow_meter_modify_state(priv, fm, MLX5_FLOW_METER_ENABLE, + error); + if (!ret) + fm->active_state = MLX5_FLOW_METER_ENABLE; + return ret; +} + +/** + * Callback to disable flow meter. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_disable(struct rte_eth_dev *dev, + uint32_t meter_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter not found."); + if (fm->active_state == MLX5_FLOW_METER_DISABLE) + return 0; + ret = mlx5_flow_meter_modify_state(priv, fm, MLX5_FLOW_METER_DISABLE, + error); + if (!ret) + fm->active_state = MLX5_FLOW_METER_DISABLE; + return ret; +} + +/** + * Callback to update meter profile. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[in] meter_profile_id + * To be updated meter profile id. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_update(struct rte_eth_dev *dev, + uint32_t meter_id, + uint32_t meter_profile_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter_profile *old_fmp; + struct mlx5_flow_meter *fm; + uint64_t modify_bits = MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS | + MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter profile must exist. */ + fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile not found."); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter not found."); + /* MTR object already set to meter profile id. */ + old_fmp = fm->profile; + if (fmp == old_fmp) + return 0; + /* Update the profile. */ + fm->profile = fmp; + /* Update meter params in HW (if not disabled). 
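+	 * Only the CBS/CIR fields selected by modify_bits are pushed. On
+	 * failure fm->profile is rolled back to the old profile; the profile
+	 * reference counters are only moved once the hardware update has
+	 * succeeded.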
*/ + if (fm->active_state == MLX5_FLOW_METER_DISABLE) + return 0; + ret = mlx5_flow_meter_action_modify(priv, fm, &fm->profile->srtcm_prm, + modify_bits, fm->active_state); + if (ret) { + fm->profile = old_fmp; + return -rte_mtr_error_set(error, -ret, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, "Failed to update meter" + " parmeters in hardware."); + } + old_fmp->ref_cnt--; + fmp->ref_cnt++; + return 0; +} + +/** + * Callback to update meter stats mask. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[in] stats_mask + * To be updated stats_mask. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_stats_update(struct rte_eth_dev *dev, + uint32_t meter_id, + uint64_t stats_mask, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter object id not valid."); + fm->policer_stats.stats_mask = stats_mask; + return 0; +} + +/** + * Callback to read meter statistics. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] stats + * Pointer to store the statistics. + * @param[out] stats_mask + * Pointer to store the stats_mask. + * @param[in] clear + * Statistic to be cleared after read or not. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_stats_read(struct rte_eth_dev *dev, + uint32_t meter_id, + struct rte_mtr_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_mtr_error *error) +{ + static uint64_t meter2mask[RTE_MTR_DROPPED + 1] = { + RTE_MTR_STATS_N_PKTS_GREEN | RTE_MTR_STATS_N_BYTES_GREEN, + RTE_MTR_STATS_N_PKTS_YELLOW | RTE_MTR_STATS_N_BYTES_YELLOW, + RTE_MTR_STATS_N_PKTS_RED | RTE_MTR_STATS_N_BYTES_RED, + RTE_MTR_STATS_N_PKTS_DROPPED | RTE_MTR_STATS_N_BYTES_DROPPED + }; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + struct mlx5_flow_policer_stats *ps; + uint64_t pkts_dropped = 0; + uint64_t bytes_dropped = 0; + uint64_t pkts; + uint64_t bytes; + int i; + int ret = 0; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter object id not valid."); + ps = &fm->policer_stats; + *stats_mask = ps->stats_mask; + for (i = 0; i < RTE_MTR_DROPPED; i++) { + if (*stats_mask & meter2mask[i]) { + ret = mlx5_counter_query(dev, ps->cnt[i], clear, &pkts, + &bytes); + if (ret) + goto error; + if (fm->action[i] == MTR_POLICER_ACTION_DROP) { + pkts_dropped += pkts; + bytes_dropped += bytes; + } + /* If need to read the packets, set it. */ + if ((1 << i) & (*stats_mask & meter2mask[i])) + stats->n_pkts[i] = pkts; + /* If need to read the bytes, set it. 
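+			 * Assuming the standard rte_mtr stats mask layout,
+			 * where the per-color packet counters occupy bits
+			 * 0..RTE_MTR_DROPPED and the byte counters follow
+			 * them, (1 << i) above selects the packet statistic
+			 * and (1 << (RTE_MTR_DROPPED + 1 + i)) here selects
+			 * the matching byte statistic for color i.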
*/ + if ((1 << (RTE_MTR_DROPPED + 1 + i)) & + (*stats_mask & meter2mask[i])) + stats->n_bytes[i] = bytes; + } + } + /* Dropped packets/bytes are treated differently. */ + if (*stats_mask & meter2mask[i]) { + ret = mlx5_counter_query(dev, ps->cnt[i], clear, &pkts, + &bytes); + if (ret) + goto error; + pkts += pkts_dropped; + bytes += bytes_dropped; + /* If need to read the packets, set it. */ + if ((*stats_mask & meter2mask[i]) & + RTE_MTR_STATS_N_PKTS_DROPPED) + stats->n_pkts_dropped = pkts; + /* If need to read the bytes, set it. */ + if ((*stats_mask & meter2mask[i]) & + RTE_MTR_STATS_N_BYTES_DROPPED) + stats->n_bytes_dropped = bytes; + } + return 0; +error: + return -rte_mtr_error_set(error, ret, RTE_MTR_ERROR_TYPE_STATS, NULL, + "Failed to read policer counters."); +} + +static const struct rte_mtr_ops mlx5_flow_mtr_ops = { + .capabilities_get = mlx5_flow_mtr_cap_get, + .meter_profile_add = mlx5_flow_meter_profile_add, + .meter_profile_delete = mlx5_flow_meter_profile_delete, + .create = mlx5_flow_meter_create, + .destroy = mlx5_flow_meter_destroy, + .meter_enable = mlx5_flow_meter_enable, + .meter_disable = mlx5_flow_meter_disable, + .meter_profile_update = mlx5_flow_meter_profile_update, + .meter_dscp_table_update = NULL, + .policer_actions_update = NULL, + .stats_update = mlx5_flow_meter_stats_update, + .stats_read = mlx5_flow_meter_stats_read, +}; + +/** + * Get meter operations. + * + * @param dev + * Pointer to Ethernet device structure. + * @param arg + * Pointer to set the mtr operations. + * + * @return + * Always 0. + */ +int +mlx5_flow_meter_ops_get(struct rte_eth_dev *dev __rte_unused, void *arg) +{ + *(const struct rte_mtr_ops **)arg = &mlx5_flow_mtr_ops; + return 0; +} + +/** + * Find meter by id. + * + * @param priv + * Pointer to mlx5_priv. + * @param meter_id + * Meter id. + * + * @return + * Pointer to the profile found on success, NULL otherwise. + */ +struct mlx5_flow_meter * +mlx5_flow_meter_find(struct mlx5_priv *priv, uint32_t meter_id) +{ + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_flow_meter *fm; + + TAILQ_FOREACH(fm, fms, next) + if (meter_id == fm->meter_id) + return fm; + return NULL; +} + +/** + * Attach meter to flow. + * Unidirectional Meter creation can only be done + * when flow direction is known, i.e. when calling meter_attach. + * + * @param [in] priv + * Pointer to mlx5 private data. + * @param [in] meter_id + * Flow meter id. + * @param [in] attr + * Pointer to flow attributes. + * @param [out] error + * Pointer to error structure. + * + * @return the flow meter pointer, NULL otherwise. + */ +struct mlx5_flow_meter * +mlx5_flow_meter_attach(struct mlx5_priv *priv, uint32_t meter_id, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_flow_meter *fm; + + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) { + rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter object id not valid"); + goto error; + } + if (!fm->shared && fm->ref_cnt) { + DRV_LOG(ERR, "Cannot share a non-shared meter."); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter can't be shared"); + goto error; + } + if (!fm->ref_cnt++) { + MLX5_ASSERT(!fm->mfts->meter_action); + fm->ingress = attr->ingress; + fm->egress = attr->egress; + fm->transfer = attr->transfer; + /* This also creates the meter object. 
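+		 * The meter action is created lazily on the first attach
+		 * because its destination table depends on the flow direction
+		 * recorded just above; subsequent attaches must use the same
+		 * ingress/egress/transfer attributes, which the else branch
+		 * below enforces.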
*/ + fm->mfts->meter_action = mlx5_flow_meter_action_create(priv, + fm); + if (!fm->mfts->meter_action) + goto error_detach; + } else { + MLX5_ASSERT(fm->mfts->meter_action); + if (attr->transfer != fm->transfer || + attr->ingress != fm->ingress || + attr->egress != fm->egress) { + DRV_LOG(ERR, "meter I/O attributes do not " + "match flow I/O attributes."); + goto error_detach; + } + } + return fm; +error_detach: + mlx5_flow_meter_detach(fm); + rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + fm->mfts->meter_action ? "Meter attr not match" : + "Meter action create failed"); +error: + return NULL; +} + +/** + * Detach meter from flow. + * + * @param [in] fm + * Pointer to flow meter. + */ +void +mlx5_flow_meter_detach(struct mlx5_flow_meter *fm) +{ + MLX5_ASSERT(fm->ref_cnt); + if (--fm->ref_cnt) + return; + if (fm->mfts->meter_action) + mlx5_glue->destroy_flow_action(fm->mfts->meter_action); + fm->mfts->meter_action = NULL; + fm->ingress = 0; + fm->egress = 0; + fm->transfer = 0; +} + +/** + * Flush meter configuration. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_meter_flush(struct rte_eth_dev *dev, struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter *fm; + const struct rte_flow_attr attr = { + .ingress = 1, + .egress = 1, + .transfer = priv->config.dv_esw_en ? 1 : 0, + }; + void *tmp; + uint32_t i; + + TAILQ_FOREACH_SAFE(fm, fms, next, tmp) { + /* Meter object must not have any owner. */ + MLX5_ASSERT(!fm->ref_cnt); + /* Get meter profile. */ + fmp = fm->profile; + if (fmp == NULL) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "MTR object meter profile invalid."); + /* Update dependencies. */ + fmp->ref_cnt--; + /* Remove from list. */ + TAILQ_REMOVE(fms, fm, next); + /* Free policer counters. */ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) + if (fm->policer_stats.cnt[i]) + mlx5_counter_free(dev, + fm->policer_stats.cnt[i]); + /* Free meter flow table. */ + mlx5_flow_destroy_policer_rules(dev, fm, &attr); + mlx5_flow_destroy_mtr_tbls(dev, fm->mfts); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], fm->idx); + } + TAILQ_FOREACH_SAFE(fmp, fmps, next, tmp) { + /* Check unused. */ + MLX5_ASSERT(!fmp->ref_cnt); + /* Remove from list. */ + TAILQ_REMOVE(&priv->flow_meter_profiles, fmp, next); + rte_free(fmp); + } + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c new file mode 100644 index 000000000..c266e5683 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c @@ -0,0 +1,1987 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_ip.h> + +#include <mlx5_glue.h> +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_flow.h" +#include "mlx5_rxtx.h" + +#define VERBS_SPEC_INNER(item_flags) \ + (!!((item_flags) & MLX5_FLOW_LAYER_TUNNEL) ? IBV_FLOW_SPEC_INNER : 0) + +/** + * Get Verbs flow counter by index. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] idx + * mlx5 flow counter index in the container. + * @param[out] ppool + * mlx5 flow counter pool in the container, + * + * @return + * A pointer to the counter, NULL otherwise. + */ +static struct mlx5_flow_counter * +flow_verbs_counter_get_by_idx(struct rte_eth_dev *dev, + uint32_t idx, + struct mlx5_flow_counter_pool **ppool) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0); + struct mlx5_flow_counter_pool *pool; + + idx--; + pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL]; + MLX5_ASSERT(pool); + if (ppool) + *ppool = pool; + return MLX5_POOL_GET_CNT(pool, idx % MLX5_COUNTERS_PER_POOL); +} + +/** + * Create Verbs flow counter with Verbs library. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] counter + * mlx5 flow counter object, contains the counter id, + * handle of created Verbs flow counter is returned + * in cs field (if counters are supported). + * + * @return + * 0 On success else a negative errno value is returned + * and rte_errno is set. + */ +static int +flow_verbs_counter_create(struct rte_eth_dev *dev, + struct mlx5_flow_counter_ext *counter) +{ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_counter_set_init_attr init = { + .counter_set_id = counter->id}; + + counter->cs = mlx5_glue->create_counter_set(ctx, &init); + if (!counter->cs) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + return 0; +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_counters_init_attr init = {0}; + struct ibv_counter_attach_attr attach; + int ret; + + memset(&attach, 0, sizeof(attach)); + counter->cs = mlx5_glue->create_counters(ctx, &init); + if (!counter->cs) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + attach.counter_desc = IBV_COUNTER_PACKETS; + attach.index = 0; + ret = mlx5_glue->attach_counters(counter->cs, &attach, NULL); + if (!ret) { + attach.counter_desc = IBV_COUNTER_BYTES; + attach.index = 1; + ret = mlx5_glue->attach_counters + (counter->cs, &attach, NULL); + } + if (ret) { + claim_zero(mlx5_glue->destroy_counters(counter->cs)); + counter->cs = NULL; + rte_errno = ret; + return -ret; + } + return 0; +#else + (void)dev; + (void)counter; + rte_errno = ENOTSUP; + return -ENOTSUP; +#endif +} + +/** + * Get a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] shared + * Indicate if this counter is shared with other flows. + * @param[in] id + * Counter identifier. + * + * @return + * Index to the counter, 0 otherwise and rte_errno is set. 
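+ *
+ * The returned index is 1-based: 0 is used as the failure/"no counter" value,
+ * and a valid value encodes the pool number plus the offset within that pool
+ * (see MLX5_MAKE_CNT_IDX() and flow_verbs_counter_get_by_idx() above, which
+ * decrements the index before splitting it).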
+ */ +static uint32_t +flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0); + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + struct mlx5_flow_counter *cnt = NULL; + uint32_t n_valid = rte_atomic16_read(&cont->n_valid); + uint32_t pool_idx; + uint32_t i; + int ret; + + if (shared) { + for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) { + pool = cont->pools[pool_idx]; + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i); + if (cnt_ext->shared && cnt_ext->id == id) { + cnt_ext->ref_cnt++; + return MLX5_MAKE_CNT_IDX(pool_idx, i); + } + } + } + } + for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) { + pool = cont->pools[pool_idx]; + if (!pool) + continue; + cnt = TAILQ_FIRST(&pool->counters); + if (cnt) + break; + } + if (!cnt) { + struct mlx5_flow_counter_pool **pools; + uint32_t size; + + if (n_valid == cont->n) { + /* Resize the container pool array. */ + size = sizeof(struct mlx5_flow_counter_pool *) * + (n_valid + MLX5_CNT_CONTAINER_RESIZE); + pools = rte_zmalloc(__func__, size, 0); + if (!pools) + return 0; + if (n_valid) { + memcpy(pools, cont->pools, + sizeof(struct mlx5_flow_counter_pool *) * + n_valid); + rte_free(cont->pools); + } + cont->pools = pools; + cont->n += MLX5_CNT_CONTAINER_RESIZE; + } + /* Allocate memory for new pool*/ + size = sizeof(*pool) + (sizeof(*cnt_ext) + sizeof(*cnt)) * + MLX5_COUNTERS_PER_POOL; + pool = rte_calloc(__func__, 1, size, 0); + if (!pool) + return 0; + pool->type |= CNT_POOL_TYPE_EXT; + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt = MLX5_POOL_GET_CNT(pool, i); + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + } + cnt = MLX5_POOL_GET_CNT(pool, 0); + cont->pools[n_valid] = pool; + pool_idx = n_valid; + rte_atomic16_add(&cont->n_valid, 1); + TAILQ_INSERT_HEAD(&cont->pool_list, pool, next); + } + i = MLX5_CNT_ARRAY_IDX(pool, cnt); + cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i); + cnt_ext->id = id; + cnt_ext->shared = shared; + cnt_ext->ref_cnt = 1; + cnt->hits = 0; + cnt->bytes = 0; + /* Create counter with Verbs. */ + ret = flow_verbs_counter_create(dev, cnt_ext); + if (!ret) { + TAILQ_REMOVE(&pool->counters, cnt, next); + return MLX5_MAKE_CNT_IDX(pool_idx, i); + } + /* Some error occurred in Verbs library. */ + rte_errno = -ret; + return 0; +} + +/** + * Release a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + */ +static void +flow_verbs_counter_release(struct rte_eth_dev *dev, uint32_t counter) +{ + struct mlx5_flow_counter_pool *pool; + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_ext *cnt_ext; + + cnt = flow_verbs_counter_get_by_idx(dev, counter, + &pool); + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + if (--cnt_ext->ref_cnt == 0) { +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs)); + cnt_ext->cs = NULL; +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs)); + cnt_ext->cs = NULL; +#endif + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + } +} + +/** + * Query a flow counter via Verbs library call. 
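+ *
+ * The Verbs counters are treated as cumulative by this driver; the values
+ * reported back in struct rte_flow_query_count are relative to the hits/bytes
+ * cached in the mlx5 counter at the last reset, and a query with the reset
+ * flag set refreshes that baseline.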
+ * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_verbs_counter_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow, void *data, + struct rte_flow_error *error) +{ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + if (flow->counter) { + struct mlx5_flow_counter_pool *pool; + struct mlx5_flow_counter *cnt = flow_verbs_counter_get_by_idx + (dev, flow->counter, &pool); + struct mlx5_flow_counter_ext *cnt_ext = MLX5_CNT_TO_CNT_EXT + (pool, cnt); + struct rte_flow_query_count *qc = data; + uint64_t counters[2] = {0, 0}; +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct ibv_query_counter_set_attr query_cs_attr = { + .cs = cnt_ext->cs, + .query_flags = IBV_COUNTER_SET_FORCE_UPDATE, + }; + struct ibv_counter_set_data query_out = { + .out = counters, + .outlen = 2 * sizeof(uint64_t), + }; + int err = mlx5_glue->query_counter_set(&query_cs_attr, + &query_out); +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + int err = mlx5_glue->query_counters + (cnt_ext->cs, counters, + RTE_DIM(counters), + IBV_READ_COUNTERS_ATTR_PREFER_CACHED); +#endif + if (err) + return rte_flow_error_set + (error, err, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot read counter"); + qc->hits_set = 1; + qc->bytes_set = 1; + qc->hits = counters[0] - cnt->hits; + qc->bytes = counters[1] - cnt->bytes; + if (qc->reset) { + cnt->hits = counters[0]; + cnt->bytes = counters[1]; + } + return 0; + } + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "flow does not have counter"); +#else + (void)flow; + (void)data; + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not available"); +#endif +} + +/** + * Add a verbs item specification into @p verbs. + * + * @param[out] verbs + * Pointer to verbs structure. + * @param[in] src + * Create specification. + * @param[in] size + * Size in bytes of the specification to copy. + */ +static void +flow_verbs_spec_add(struct mlx5_flow_verbs_workspace *verbs, + void *src, unsigned int size) +{ + void *dst; + + if (!verbs) + return; + MLX5_ASSERT(verbs->specs); + dst = (void *)(verbs->specs + verbs->size); + memcpy(dst, src, size); + ++verbs->attr.num_of_specs; + verbs->size += size; +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_eth(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = item->mask; + const unsigned int size = sizeof(struct ibv_flow_spec_eth); + struct ibv_flow_spec_eth eth = { + .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_eth_mask; + if (spec) { + unsigned int i; + + memcpy(ð.val.dst_mac, spec->dst.addr_bytes, + RTE_ETHER_ADDR_LEN); + memcpy(ð.val.src_mac, spec->src.addr_bytes, + RTE_ETHER_ADDR_LEN); + eth.val.ether_type = spec->type; + memcpy(ð.mask.dst_mac, mask->dst.addr_bytes, + RTE_ETHER_ADDR_LEN); + memcpy(ð.mask.src_mac, mask->src.addr_bytes, + RTE_ETHER_ADDR_LEN); + eth.mask.ether_type = mask->type; + /* Remove unwanted bits from values. 
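+		 * i.e. the spec values are ANDed with the mask so that bits
+		 * the application did not ask to match cannot leak into the
+		 * Verbs specification; the same normalization is done in the
+		 * other flow_verbs_translate_item_*() helpers.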
*/ + for (i = 0; i < RTE_ETHER_ADDR_LEN; ++i) { + eth.val.dst_mac[i] &= eth.mask.dst_mac[i]; + eth.val.src_mac[i] &= eth.mask.src_mac[i]; + } + eth.val.ether_type &= eth.mask.ether_type; + } + flow_verbs_spec_add(&dev_flow->verbs, ð, size); +} + +/** + * Update the VLAN tag in the Verbs Ethernet specification. + * This function assumes that the input is valid and there is space to add + * the requested item. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] eth + * Verbs structure containing the VLAN information to copy. + */ +static void +flow_verbs_item_vlan_update(struct ibv_flow_attr *attr, + struct ibv_flow_spec_eth *eth) +{ + unsigned int i; + const enum ibv_flow_spec_type search = eth->type; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + struct ibv_flow_spec_eth *e = + (struct ibv_flow_spec_eth *)hdr; + + e->val.vlan_tag = eth->val.vlan_tag; + e->mask.vlan_tag = eth->mask.vlan_tag; + e->val.ether_type = eth->val.ether_type; + e->mask.ether_type = eth->mask.ether_type; + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_vlan(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_eth); + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + struct ibv_flow_spec_eth eth = { + .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + + if (!mask) + mask = &rte_flow_item_vlan_mask; + if (spec) { + eth.val.vlan_tag = spec->tci; + eth.mask.vlan_tag = mask->tci; + eth.val.vlan_tag &= eth.mask.vlan_tag; + eth.val.ether_type = spec->inner_type; + eth.mask.ether_type = mask->inner_type; + eth.val.ether_type &= eth.mask.ether_type; + } + if (!(item_flags & l2m)) + flow_verbs_spec_add(&dev_flow->verbs, ð, size); + else + flow_verbs_item_vlan_update(&dev_flow->verbs.attr, ð); + if (!tunnel) + dev_flow->handle->vf_vlan.tag = + rte_be_to_cpu_16(spec->tci) & 0x0fff; +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. 
+ */ +static void +flow_verbs_translate_item_ipv4(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext); + struct ibv_flow_spec_ipv4_ext ipv4 = { + .type = IBV_FLOW_SPEC_IPV4_EXT | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_ipv4_mask; + if (spec) { + ipv4.val = (struct ibv_flow_ipv4_ext_filter){ + .src_ip = spec->hdr.src_addr, + .dst_ip = spec->hdr.dst_addr, + .proto = spec->hdr.next_proto_id, + .tos = spec->hdr.type_of_service, + }; + ipv4.mask = (struct ibv_flow_ipv4_ext_filter){ + .src_ip = mask->hdr.src_addr, + .dst_ip = mask->hdr.dst_addr, + .proto = mask->hdr.next_proto_id, + .tos = mask->hdr.type_of_service, + }; + /* Remove unwanted bits from values. */ + ipv4.val.src_ip &= ipv4.mask.src_ip; + ipv4.val.dst_ip &= ipv4.mask.dst_ip; + ipv4.val.proto &= ipv4.mask.proto; + ipv4.val.tos &= ipv4.mask.tos; + } + flow_verbs_spec_add(&dev_flow->verbs, &ipv4, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_ipv6(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_ipv6); + struct ibv_flow_spec_ipv6 ipv6 = { + .type = IBV_FLOW_SPEC_IPV6 | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_ipv6_mask; + if (spec) { + unsigned int i; + uint32_t vtc_flow_val; + uint32_t vtc_flow_mask; + + memcpy(&ipv6.val.src_ip, spec->hdr.src_addr, + RTE_DIM(ipv6.val.src_ip)); + memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr, + RTE_DIM(ipv6.val.dst_ip)); + memcpy(&ipv6.mask.src_ip, mask->hdr.src_addr, + RTE_DIM(ipv6.mask.src_ip)); + memcpy(&ipv6.mask.dst_ip, mask->hdr.dst_addr, + RTE_DIM(ipv6.mask.dst_ip)); + vtc_flow_val = rte_be_to_cpu_32(spec->hdr.vtc_flow); + vtc_flow_mask = rte_be_to_cpu_32(mask->hdr.vtc_flow); + ipv6.val.flow_label = + rte_cpu_to_be_32((vtc_flow_val & RTE_IPV6_HDR_FL_MASK) >> + RTE_IPV6_HDR_FL_SHIFT); + ipv6.val.traffic_class = (vtc_flow_val & RTE_IPV6_HDR_TC_MASK) >> + RTE_IPV6_HDR_TC_SHIFT; + ipv6.val.next_hdr = spec->hdr.proto; + ipv6.mask.flow_label = + rte_cpu_to_be_32((vtc_flow_mask & RTE_IPV6_HDR_FL_MASK) >> + RTE_IPV6_HDR_FL_SHIFT); + ipv6.mask.traffic_class = (vtc_flow_mask & RTE_IPV6_HDR_TC_MASK) >> + RTE_IPV6_HDR_TC_SHIFT; + ipv6.mask.next_hdr = mask->hdr.proto; + /* Remove unwanted bits from values. */ + for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) { + ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i]; + ipv6.val.dst_ip[i] &= ipv6.mask.dst_ip[i]; + } + ipv6.val.flow_label &= ipv6.mask.flow_label; + ipv6.val.traffic_class &= ipv6.mask.traffic_class; + ipv6.val.next_hdr &= ipv6.mask.next_hdr; + } + flow_verbs_spec_add(&dev_flow->verbs, &ipv6, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. 
+ * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_tcp(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); + struct ibv_flow_spec_tcp_udp tcp = { + .type = IBV_FLOW_SPEC_TCP | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_tcp_mask; + if (spec) { + tcp.val.dst_port = spec->hdr.dst_port; + tcp.val.src_port = spec->hdr.src_port; + tcp.mask.dst_port = mask->hdr.dst_port; + tcp.mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + tcp.val.src_port &= tcp.mask.src_port; + tcp.val.dst_port &= tcp.mask.dst_port; + } + flow_verbs_spec_add(&dev_flow->verbs, &tcp, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_udp(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); + struct ibv_flow_spec_tcp_udp udp = { + .type = IBV_FLOW_SPEC_UDP | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_udp_mask; + if (spec) { + udp.val.dst_port = spec->hdr.dst_port; + udp.val.src_port = spec->hdr.src_port; + udp.mask.dst_port = mask->hdr.dst_port; + udp.mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + udp.val.src_port &= udp.mask.src_port; + udp.val.dst_port &= udp.mask.dst_port; + } + item++; + while (item->type == RTE_FLOW_ITEM_TYPE_VOID) + item++; + if (!(udp.val.dst_port & udp.mask.dst_port)) { + switch ((item)->type) { + case RTE_FLOW_ITEM_TYPE_VXLAN: + udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN); + udp.mask.dst_port = 0xffff; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN_GPE); + udp.mask.dst_port = 0xffff; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + udp.val.dst_port = htons(MLX5_UDP_PORT_MPLS); + udp.mask.dst_port = 0xffff; + break; + default: + break; + } + } + + flow_verbs_spec_add(&dev_flow->verbs, &udp, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. 
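+ *
+ * The 24-bit VNI is copied into bytes 1-3 of the 32-bit Verbs tunnel_id
+ * through the local vni/vlan_id union, for the value and the mask alike,
+ * and the value is then ANDed with the mask as for the other items.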
+ */ +static void +flow_verbs_translate_item_vxlan(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_vxlan *spec = item->spec; + const struct rte_flow_item_vxlan *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!mask) + mask = &rte_flow_item_vxlan_mask; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + vxlan.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan.mask.tunnel_id = id.vlan_id; + /* Remove unwanted bits from values. */ + vxlan.val.tunnel_id &= vxlan.mask.tunnel_id; + } + flow_verbs_spec_add(&dev_flow->verbs, &vxlan, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_vxlan_gpe(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_vxlan_gpe *spec = item->spec; + const struct rte_flow_item_vxlan_gpe *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan_gpe = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!mask) + mask = &rte_flow_item_vxlan_gpe_mask; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + vxlan_gpe.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan_gpe.mask.tunnel_id = id.vlan_id; + /* Remove unwanted bits from values. */ + vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id; + } + flow_verbs_spec_add(&dev_flow->verbs, &vxlan_gpe, size); +} + +/** + * Update the protocol in Verbs IPv4/IPv6 spec. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] search + * Specification type to search in order to update the IP protocol. + * @param[in] protocol + * Protocol value to set if none is present in the specification. + */ +static void +flow_verbs_item_gre_ip_protocol_update(struct ibv_flow_attr *attr, + enum ibv_flow_spec_type search, + uint8_t protocol) +{ + unsigned int i; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + if (!attr) + return; + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + union { + struct ibv_flow_spec_ipv4_ext *ipv4; + struct ibv_flow_spec_ipv6 *ipv6; + } ip; + + switch (search) { + case IBV_FLOW_SPEC_IPV4_EXT: + ip.ipv4 = (struct ibv_flow_spec_ipv4_ext *)hdr; + if (!ip.ipv4->val.proto) { + ip.ipv4->val.proto = protocol; + ip.ipv4->mask.proto = 0xff; + } + break; + case IBV_FLOW_SPEC_IPV6: + ip.ipv6 = (struct ibv_flow_spec_ipv6 *)hdr; + if (!ip.ipv6->val.next_hdr) { + ip.ipv6->val.next_hdr = protocol; + ip.ipv6->mask.next_hdr = 0xff; + } + break; + default: + break; + } + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } +} + +/** + * Convert the @p item into a Verbs specification. 
This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_gre(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags) +{ + struct mlx5_flow_verbs_workspace *verbs = &dev_flow->verbs; +#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel tunnel = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; +#else + const struct rte_flow_item_gre *spec = item->spec; + const struct rte_flow_item_gre *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_gre); + struct ibv_flow_spec_gre tunnel = { + .type = IBV_FLOW_SPEC_GRE, + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_gre_mask; + if (spec) { + tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver; + tunnel.val.protocol = spec->protocol; + tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver; + tunnel.mask.protocol = mask->protocol; + /* Remove unwanted bits from values. */ + tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver; + tunnel.val.protocol &= tunnel.mask.protocol; + tunnel.val.key &= tunnel.mask.key; + } +#endif + if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) + flow_verbs_item_gre_ip_protocol_update(&verbs->attr, + IBV_FLOW_SPEC_IPV4_EXT, + IPPROTO_GRE); + else + flow_verbs_item_gre_ip_protocol_update(&verbs->attr, + IBV_FLOW_SPEC_IPV6, + IPPROTO_GRE); + flow_verbs_spec_add(verbs, &tunnel, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. This function also return the action that was added. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_mpls(struct mlx5_flow *dev_flow __rte_unused, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags __rte_unused) +{ +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + const struct rte_flow_item_mpls *spec = item->spec; + const struct rte_flow_item_mpls *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_mpls); + struct ibv_flow_spec_mpls mpls = { + .type = IBV_FLOW_SPEC_MPLS, + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_mpls_mask; + if (spec) { + memcpy(&mpls.val.label, spec, sizeof(mpls.val.label)); + memcpy(&mpls.mask.label, mask, sizeof(mpls.mask.label)); + /* Remove unwanted bits from values. */ + mpls.val.label &= mpls.mask.label; + } + flow_verbs_spec_add(&dev_flow->verbs, &mpls, size); +#endif +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. 
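+ *
+ * Drop is a fate action: flow_verbs_validate() rejects combining it with
+ * any other action except the count action.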
+ */ +static void +flow_verbs_translate_action_drop + (struct mlx5_flow *dev_flow, + const struct rte_flow_action *action __rte_unused) +{ + unsigned int size = sizeof(struct ibv_flow_spec_action_drop); + struct ibv_flow_spec_action_drop drop = { + .type = IBV_FLOW_SPEC_ACTION_DROP, + .size = size, + }; + + flow_verbs_spec_add(&dev_flow->verbs, &drop, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] rss_desc + * Pointer to mlx5_flow_rss_desc. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_queue(struct mlx5_flow_rss_desc *rss_desc, + const struct rte_flow_action *action) +{ + const struct rte_flow_action_queue *queue = action->conf; + + rss_desc->queue[0] = queue->index; + rss_desc->queue_num = 1; +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] rss_desc + * Pointer to mlx5_flow_rss_desc. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_rss(struct mlx5_flow_rss_desc *rss_desc, + const struct rte_flow_action *action) +{ + const struct rte_flow_action_rss *rss = action->conf; + const uint8_t *rss_key; + + memcpy(rss_desc->queue, rss->queue, rss->queue_num * sizeof(uint16_t)); + rss_desc->queue_num = rss->queue_num; + /* NULL RSS key indicates default RSS key. */ + rss_key = !rss->key ? rss_hash_default_key : rss->key; + memcpy(rss_desc->key, rss_key, MLX5_RSS_HASH_KEY_LEN); + /* + * rss->level and rss.types should be set in advance when expanding + * items for RSS. + */ +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_flag + (struct mlx5_flow *dev_flow, + const struct rte_flow_action *action __rte_unused) +{ + unsigned int size = sizeof(struct ibv_flow_spec_action_tag); + struct ibv_flow_spec_action_tag tag = { + .type = IBV_FLOW_SPEC_ACTION_TAG, + .size = size, + .tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT), + }; + + flow_verbs_spec_add(&dev_flow->verbs, &tag, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_mark(struct mlx5_flow *dev_flow, + const struct rte_flow_action *action) +{ + const struct rte_flow_action_mark *mark = action->conf; + unsigned int size = sizeof(struct ibv_flow_spec_action_tag); + struct ibv_flow_spec_action_tag tag = { + .type = IBV_FLOW_SPEC_ACTION_TAG, + .size = size, + .tag_id = mlx5_flow_mark_set(mark->id), + }; + + flow_verbs_spec_add(&dev_flow->verbs, &tag, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] action + * Action configuration. 
+ * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 On success else a negative errno value is returned and rte_errno is set. + */ +static int +flow_verbs_translate_action_count(struct mlx5_flow *dev_flow, + const struct rte_flow_action *action, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + const struct rte_flow_action_count *count = action->conf; + struct rte_flow *flow = dev_flow->flow; +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct mlx5_flow_counter_pool *pool; + struct mlx5_flow_counter *cnt = NULL; + struct mlx5_flow_counter_ext *cnt_ext; + unsigned int size = sizeof(struct ibv_flow_spec_counter_action); + struct ibv_flow_spec_counter_action counter = { + .type = IBV_FLOW_SPEC_ACTION_COUNT, + .size = size, + }; +#endif + + if (!flow->counter) { + flow->counter = flow_verbs_counter_new(dev, count->shared, + count->id); + if (!flow->counter) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "cannot get counter" + " context."); + } +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool); + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + counter.counter_set_handle = cnt_ext->cs->handle; + flow_verbs_spec_add(&dev_flow->verbs, &counter, size); +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool); + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + counter.counters = cnt_ext->cs; + flow_verbs_spec_add(&dev_flow->verbs, &counter, size); +#endif + return 0; +} + +/** + * Internal validation function. For validating both actions and items. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] hairpin + * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_verbs_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external __rte_unused, + int hairpin __rte_unused, + struct rte_flow_error *error) +{ + int ret; + uint64_t action_flags = 0; + uint64_t item_flags = 0; + uint64_t last_item = 0; + uint8_t next_protocol = 0xff; + uint16_t ether_type = 0; + + if (items == NULL) + return -1; + ret = mlx5_flow_validate_attributes(dev, attr, error); + if (ret < 0) + return ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int ret = 0; + + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = tunnel ? 
MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_eth *) + items->spec)->type; + ether_type &= + ((const struct rte_flow_item_eth *) + items->mask)->type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = mlx5_flow_validate_item_vlan(items, item_flags, + dev, error); + if (ret < 0) + return ret; + last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_vlan *) + items->spec)->inner_type; + ether_type &= + ((const struct rte_flow_item_vlan *) + items->mask)->inner_type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + last_item, + ether_type, NULL, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) { + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + next_protocol &= + ((const struct rte_flow_item_ipv4 *) + (items->mask))->hdr.next_proto_id; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + last_item, + ether_type, NULL, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) { + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol &= + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_validate_item_tcp + (items, item_flags, + next_protocol, + &rte_flow_item_tcp_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_validate_item_vxlan(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + ret = mlx5_flow_validate_item_vxlan_gpe(items, + item_flags, + dev, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + ret = mlx5_flow_validate_item_gre(items, item_flags, + next_protocol, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + ret = mlx5_flow_validate_item_mpls(dev, items, + item_flags, + last_item, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_MPLS; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + item_flags |= last_item; + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + ret = mlx5_flow_validate_action_flag(action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_FLAG; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = mlx5_flow_validate_action_mark(actions, + action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_MARK; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_validate_action_drop(action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + ret = mlx5_flow_validate_action_queue(actions, + action_flags, dev, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + ret = mlx5_flow_validate_action_rss(actions, + action_flags, dev, + attr, item_flags, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_RSS; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = mlx5_flow_validate_action_count(dev, attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + /* + * Validate the drop action mutual exclusion with other actions. + * Drop action is mutually-exclusive with any other action, except for + * Count action. + */ + if ((action_flags & MLX5_FLOW_ACTION_DROP) && + (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Drop action is mutually-exclusive " + "with any other action, except for " + "Count action"); + if (!(action_flags & MLX5_FLOW_FATE_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "no fate action is found"); + return 0; +} + +/** + * Calculate the required bytes that are needed for the action part of the verbs + * flow. + * + * @param[in] actions + * Pointer to the list of actions. + * + * @return + * The size of the memory needed for all actions. 
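+ *
+ * For example, a (hypothetical) action list of MARK / QUEUE / COUNT / END
+ * accounts for one struct ibv_flow_spec_action_tag plus, when counter
+ * support is compiled in, one struct ibv_flow_spec_counter_action; QUEUE
+ * and RSS need no Verbs spec space because their fate is applied through
+ * the hash Rx queue instead.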
+ */ +static int +flow_verbs_get_actions_size(const struct rte_flow_action actions[]) +{ + int size = 0; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + size += sizeof(struct ibv_flow_spec_action_tag); + break; + case RTE_FLOW_ACTION_TYPE_MARK: + size += sizeof(struct ibv_flow_spec_action_tag); + break; + case RTE_FLOW_ACTION_TYPE_DROP: + size += sizeof(struct ibv_flow_spec_action_drop); + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + break; + case RTE_FLOW_ACTION_TYPE_RSS: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + size += sizeof(struct ibv_flow_spec_counter_action); +#endif + break; + default: + break; + } + } + return size; +} + +/** + * Calculate the required bytes that are needed for the item part of the verbs + * flow. + * + * @param[in] items + * Pointer to the list of items. + * + * @return + * The size of the memory needed for all items. + */ +static int +flow_verbs_get_items_size(const struct rte_flow_item items[]) +{ + int size = 0; + + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + size += sizeof(struct ibv_flow_spec_eth); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + size += sizeof(struct ibv_flow_spec_eth); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + size += sizeof(struct ibv_flow_spec_ipv4_ext); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + size += sizeof(struct ibv_flow_spec_ipv6); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + size += sizeof(struct ibv_flow_spec_tcp_udp); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + size += sizeof(struct ibv_flow_spec_tcp_udp); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + size += sizeof(struct ibv_flow_spec_tunnel); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + size += sizeof(struct ibv_flow_spec_tunnel); + break; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + case RTE_FLOW_ITEM_TYPE_GRE: + size += sizeof(struct ibv_flow_spec_gre); + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + size += sizeof(struct ibv_flow_spec_mpls); + break; +#else + case RTE_FLOW_ITEM_TYPE_GRE: + size += sizeof(struct ibv_flow_spec_tunnel); + break; +#endif + default: + break; + } + } + return size; +} + +/** + * Internal preparation function. Allocate mlx5_flow with the required size. + * The required size is calculate based on the actions and items. This function + * also returns the detected actions and items for later use. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to mlx5_flow object on success, otherwise NULL and rte_errno + * is set. 
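+ *
+ * The combined result of flow_verbs_get_actions_size() and
+ * flow_verbs_get_items_size() must fit in MLX5_VERBS_MAX_SPEC_ACT_SIZE,
+ * otherwise the function fails with E2BIG; the flow handle itself is
+ * taken from the MLX5_IPOOL_MLX5_FLOW indexed pool.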
+ */ +static struct mlx5_flow * +flow_verbs_prepare(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + size_t size = 0; + uint32_t handle_idx = 0; + struct mlx5_flow *dev_flow; + struct mlx5_flow_handle *dev_handle; + struct mlx5_priv *priv = dev->data->dev_private; + + size += flow_verbs_get_actions_size(actions); + size += flow_verbs_get_items_size(items); + if (size > MLX5_VERBS_MAX_SPEC_ACT_SIZE) { + rte_flow_error_set(error, E2BIG, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Verbs spec/action size too large"); + return NULL; + } + /* In case of corrupting the memory. */ + if (priv->flow_idx >= MLX5_NUM_MAX_DEV_FLOWS) { + rte_flow_error_set(error, ENOSPC, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not free temporary device flow"); + return NULL; + } + dev_handle = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + &handle_idx); + if (!dev_handle) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not enough memory to create flow handle"); + return NULL; + } + /* No multi-thread supporting. */ + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[priv->flow_idx++]; + dev_flow->handle = dev_handle; + dev_flow->handle_idx = handle_idx; + /* Memcpy is used, only size needs to be cleared to 0. */ + dev_flow->verbs.size = 0; + dev_flow->verbs.attr.num_of_specs = 0; + dev_flow->ingress = attr->ingress; + dev_flow->hash_fields = 0; + /* Need to set transfer attribute: not supported in Verbs mode. */ + return dev_flow; +} + +/** + * Fill the flow with verb spec. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] dev_flow + * Pointer to the mlx5 flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, else a negative errno value otherwise and rte_errno is set. 
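+ *
+ * Translation runs in two passes: actions first, then items; the deepest
+ * protocol layer matched (L2/L3/L4) selects the subpriority that
+ * mlx5_flow_adjust_priority() combines with the attribute priority at the
+ * end.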
+ */ +static int +flow_verbs_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + uint64_t item_flags = 0; + uint64_t action_flags = 0; + uint64_t priority = attr->priority; + uint32_t subpriority = 0; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *) + priv->rss_desc) + [!!priv->flow_nested_idx]; + + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = priv->config.flow_prio - 1; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + int ret; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + flow_verbs_translate_action_flag(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_FLAG; + dev_flow->handle->mark = 1; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + flow_verbs_translate_action_mark(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_MARK; + dev_flow->handle->mark = 1; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + flow_verbs_translate_action_drop(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_DROP; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + flow_verbs_translate_action_queue(rss_desc, actions); + action_flags |= MLX5_FLOW_ACTION_QUEUE; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + flow_verbs_translate_action_rss(rss_desc, actions); + action_flags |= MLX5_FLOW_ACTION_RSS; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_verbs_translate_action_count(dev_flow, + actions, + dev, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + dev_flow->act_flags = action_flags; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + flow_verbs_translate_item_eth(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + flow_verbs_translate_item_vlan(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? (MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + flow_verbs_translate_item_ipv4(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L3; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, + MLX5_IPV4_LAYER_TYPES, + MLX5_IPV4_IBV_RX_HASH); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + flow_verbs_translate_item_ipv6(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L3; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, + MLX5_IPV6_LAYER_TYPES, + MLX5_IPV6_IBV_RX_HASH); + item_flags |= tunnel ? 
MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + flow_verbs_translate_item_tcp(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L4; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, ETH_RSS_TCP, + (IBV_RX_HASH_SRC_PORT_TCP | + IBV_RX_HASH_DST_PORT_TCP)); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_verbs_translate_item_udp(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L4; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, ETH_RSS_UDP, + (IBV_RX_HASH_SRC_PORT_UDP | + IBV_RX_HASH_DST_PORT_UDP)); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + flow_verbs_translate_item_vxlan(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + flow_verbs_translate_item_vxlan_gpe(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + flow_verbs_translate_item_gre(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + flow_verbs_translate_item_mpls(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_MPLS; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, + "item not supported"); + } + } + dev_flow->handle->layers = item_flags; + /* Other members of attr will be ignored. */ + dev_flow->verbs.attr.priority = + mlx5_flow_adjust_priority(dev, priority, subpriority); + dev_flow->verbs.attr.port = (uint8_t)priv->ibv_port; + return 0; +} + +/** + * Remove the flow from the NIC but keeps it in memory. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +flow_verbs_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_handle *handle; + uint32_t handle_idx; + + if (!flow) + return; + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, handle, next) { + if (handle->ib_flow) { + claim_zero(mlx5_glue->destroy_flow(handle->ib_flow)); + handle->ib_flow = NULL; + } + /* hrxq is union, don't touch it only the flag is set. */ + if (handle->rix_hrxq) { + if (handle->fate_action == MLX5_FLOW_FATE_DROP) { + mlx5_hrxq_drop_release(dev); + handle->rix_hrxq = 0; + } else if (handle->fate_action == + MLX5_FLOW_FATE_QUEUE) { + mlx5_hrxq_release(dev, handle->rix_hrxq); + handle->rix_hrxq = 0; + } + } + if (handle->vf_vlan.tag && handle->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &handle->vf_vlan); + } +} + +/** + * Remove the flow from the NIC and the memory. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. 
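+ *
+ * Unlike flow_verbs_remove(), which only detaches the rule from the NIC
+ * and releases the hash Rx queues, this also returns every device flow
+ * handle to its indexed pool and releases the flow counter, if any.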
+ */ +static void +flow_verbs_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_handle *handle; + + if (!flow) + return; + flow_verbs_remove(dev, flow); + while (flow->dev_handles) { + uint32_t tmp_idx = flow->dev_handles; + + handle = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + tmp_idx); + if (!handle) + return; + flow->dev_handles = handle->next.next; + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + tmp_idx); + } + if (flow->counter) { + flow_verbs_counter_release(dev, flow->counter); + flow->counter = 0; + } +} + +/** + * Apply the flow to the NIC. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_handle *handle; + struct mlx5_flow *dev_flow; + struct mlx5_hrxq *hrxq; + uint32_t dev_handles; + int err; + int idx; + + for (idx = priv->flow_idx - 1; idx >= priv->flow_nested_idx; idx--) { + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[idx]; + handle = dev_flow->handle; + if (handle->fate_action == MLX5_FLOW_FATE_DROP) { + hrxq = mlx5_hrxq_drop_new(dev); + if (!hrxq) { + rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get drop hash queue"); + goto error; + } + } else { + uint32_t hrxq_idx; + struct mlx5_flow_rss_desc *rss_desc = + &((struct mlx5_flow_rss_desc *)priv->rss_desc) + [!!priv->flow_nested_idx]; + + MLX5_ASSERT(rss_desc->queue_num); + hrxq_idx = mlx5_hrxq_get(dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num); + if (!hrxq_idx) + hrxq_idx = mlx5_hrxq_new(dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num, + !!(handle->layers & + MLX5_FLOW_LAYER_TUNNEL)); + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + hrxq_idx); + if (!hrxq) { + rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get hash queue"); + goto error; + } + handle->rix_hrxq = hrxq_idx; + } + MLX5_ASSERT(hrxq); + handle->ib_flow = mlx5_glue->create_flow(hrxq->qp, + &dev_flow->verbs.attr); + if (!handle->ib_flow) { + rte_flow_error_set(error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "hardware refuses to create flow"); + goto error; + } + if (priv->vmwa_context && + handle->vf_vlan.tag && !handle->vf_vlan.created) { + /* + * The rule contains the VLAN pattern. + * For VF we are going to create VLAN + * interface to make hypervisor set correct + * e-Switch vport context. + */ + mlx5_vlan_vmwa_acquire(dev, &handle->vf_vlan); + } + } + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + dev_handles, handle, next) { + /* hrxq is union, don't touch it only the flag is set. 
*/ + if (handle->rix_hrxq) { + if (handle->fate_action == MLX5_FLOW_FATE_DROP) { + mlx5_hrxq_drop_release(dev); + handle->rix_hrxq = 0; + } else if (handle->fate_action == + MLX5_FLOW_FATE_QUEUE) { + mlx5_hrxq_release(dev, handle->rix_hrxq); + handle->rix_hrxq = 0; + } + } + if (handle->vf_vlan.tag && handle->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &handle->vf_vlan); + } + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Query a flow. + * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_verbs_query(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + int ret = -EINVAL; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_verbs_counter_query(dev, flow, data, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + return ret; +} + +const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops = { + .validate = flow_verbs_validate, + .prepare = flow_verbs_prepare, + .translate = flow_verbs_translate, + .apply = flow_verbs_apply, + .remove = flow_verbs_remove, + .destroy = flow_verbs_destroy, + .query = flow_verbs_query, +}; diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c new file mode 100644 index 000000000..291f7724c --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <inttypes.h> +#include <errno.h> +#include <netinet/in.h> +#include <sys/ioctl.h> +#include <arpa/inet.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_common.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" + +/** + * Get MAC address by querying netdevice. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] mac + * MAC address output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN]) +{ + struct ifreq request; + int ret; + + ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request); + if (ret) + return ret; + memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN); + return 0; +} + +/** + * Remove a MAC address from the internal array. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. 
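+ *
+ * On a VF the address is also removed from the kernel netdevice through
+ * the Netlink route socket (mlx5_nl_mac_addr_remove()); in any case the
+ * entry in dev->data->mac_addrs is cleared.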
+ */ +static void +mlx5_internal_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; + + MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); + if (rte_is_zero_ether_addr(&dev->data->mac_addrs[index])) + return; + if (vf) + mlx5_nl_mac_addr_remove(priv->nl_socket_route, + mlx5_ifindex(dev), priv->mac_own, + &dev->data->mac_addrs[index], index); + memset(&dev->data->mac_addrs[index], 0, sizeof(struct rte_ether_addr)); +} + +/** + * Adds a MAC address to the internal array. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_internal_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, + uint32_t index) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; + unsigned int i; + + MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); + if (rte_is_zero_ether_addr(mac)) { + rte_errno = EINVAL; + return -rte_errno; + } + /* First, make sure this address isn't already configured. */ + for (i = 0; (i != MLX5_MAX_MAC_ADDRESSES); ++i) { + /* Skip this index, it's going to be reconfigured. */ + if (i == index) + continue; + if (memcmp(&dev->data->mac_addrs[i], mac, sizeof(*mac))) + continue; + /* Address already configured elsewhere, return with error. */ + rte_errno = EADDRINUSE; + return -rte_errno; + } + if (vf) { + int ret = mlx5_nl_mac_addr_add(priv->nl_socket_route, + mlx5_ifindex(dev), priv->mac_own, + mac, index); + + if (ret) + return ret; + } + dev->data->mac_addrs[index] = *mac; + return 0; +} + +/** + * DPDK callback to remove a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. + */ +void +mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) + return; + mlx5_internal_mac_addr_remove(dev, index); + if (!dev->data->promiscuous) { + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot restart traffic: %s", + dev->data->port_id, strerror(rte_errno)); + } +} + +/** + * DPDK callback to add a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * @param vmdq + * VMDq pool index to associate address with (ignored). + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, + uint32_t index, uint32_t vmdq __rte_unused) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_internal_mac_addr_add(dev, mac, index); + if (ret < 0) + return ret; + if (!dev->data->promiscuous) + return mlx5_traffic_restart(dev); + return 0; +} + +/** + * DPDK callback to set primary MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr) +{ + uint16_t port_id; + struct mlx5_priv *priv = dev->data->dev_private; + + /* Configuring the VF instead of its representor. 
*/ + if (priv->representor) { + DRV_LOG(DEBUG, "VF represented by port %u setting primary MAC address", + dev->data->port_id); + RTE_ETH_FOREACH_DEV_SIBLING(port_id, dev->data->port_id) { + priv = rte_eth_devices[port_id].data->dev_private; + if (priv->master == 1) { + priv = dev->data->dev_private; + return mlx5_nl_vf_mac_addr_modify + (priv->nl_socket_route, + mlx5_ifindex(&rte_eth_devices[port_id]), + mac_addr, priv->representor_id); + } + } + rte_errno = -ENOTSUP; + return rte_errno; + } + + DRV_LOG(DEBUG, "port %u setting primary MAC address", + dev->data->port_id); + return mlx5_mac_addr_add(dev, mac_addr, 0, 0); +} + +/** + * DPDK callback to set multicast addresses list. + * + * @see rte_eth_dev_set_mc_addr_list() + */ +int +mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct rte_ether_addr *mc_addr_set, uint32_t nb_mc_addr) +{ + uint32_t i; + int ret; + + if (nb_mc_addr >= MLX5_MAX_MC_MAC_ADDRESSES) { + rte_errno = ENOSPC; + return -rte_errno; + } + for (i = MLX5_MAX_UC_MAC_ADDRESSES; i != MLX5_MAX_MAC_ADDRESSES; ++i) + mlx5_internal_mac_addr_remove(dev, i); + i = MLX5_MAX_UC_MAC_ADDRESSES; + while (nb_mc_addr--) { + ret = mlx5_internal_mac_addr_add(dev, mc_addr_set++, i++); + if (ret) + return ret; + } + if (!dev->data->promiscuous) + return mlx5_traffic_restart(dev); + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c new file mode 100644 index 000000000..7ad322d47 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 6WIND S.A. + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#include <stdio.h> +#include <time.h> + +#include <rte_eal.h> +#include <rte_ethdev_driver.h> +#include <rte_string_fns.h> + +#include <mlx5_common_mp.h> +#include <mlx5_common_mr.h> + +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +int +mlx5_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer) +{ + struct rte_mp_msg mp_res; + struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param; + const struct mlx5_mp_param *param = + (const struct mlx5_mp_param *)mp_msg->param; + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + struct mr_cache_entry entry; + uint32_t lkey; + int ret; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (!rte_eth_dev_is_valid_port(param->port_id)) { + rte_errno = ENODEV; + DRV_LOG(ERR, "port %u invalid port ID", param->port_id); + return -rte_errno; + } + dev = &rte_eth_devices[param->port_id]; + priv = dev->data->dev_private; + switch (param->type) { + case MLX5_MP_REQ_CREATE_MR: + mp_init_msg(&priv->mp_id, &mp_res, param->type); + lkey = mlx5_mr_create_primary(priv->sh->pd, + &priv->sh->share_cache, + &entry, param->args.addr, + priv->config.mr_ext_memseg_en); + if (lkey == UINT32_MAX) + res->result = -rte_errno; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX5_MP_REQ_VERBS_CMD_FD: + mp_init_msg(&priv->mp_id, &mp_res, param->type); + mp_res.num_fds = 1; + mp_res.fds[0] = priv->sh->ctx->cmd_fd; + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX5_MP_REQ_QUEUE_STATE_MODIFY: + mp_init_msg(&priv->mp_id, &mp_res, param->type); + res->result = mlx5_queue_state_modify_primary + (dev, ¶m->args.state_modify); + ret = rte_mp_reply(&mp_res, peer); + break; + default: + rte_errno = EINVAL; + DRV_LOG(ERR, "port %u invalid mp request type", + dev->data->port_id); + return -rte_errno; + } + return ret; +} + +/** + * IPC message handler of a secondary 
process. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[in] peer + * Pointer to the peer socket path. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer) +{ + struct rte_mp_msg mp_res; + struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param; + const struct mlx5_mp_param *param = + (const struct mlx5_mp_param *)mp_msg->param; + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + int ret; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY); + if (!rte_eth_dev_is_valid_port(param->port_id)) { + rte_errno = ENODEV; + DRV_LOG(ERR, "port %u invalid port ID", param->port_id); + return -rte_errno; + } + dev = &rte_eth_devices[param->port_id]; + priv = dev->data->dev_private; + switch (param->type) { + case MLX5_MP_REQ_START_RXTX: + DRV_LOG(INFO, "port %u starting datapath", dev->data->port_id); + rte_mb(); + dev->rx_pkt_burst = mlx5_select_rx_function(dev); + dev->tx_pkt_burst = mlx5_select_tx_function(dev); + mp_init_msg(&priv->mp_id, &mp_res, param->type); + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX5_MP_REQ_STOP_RXTX: + DRV_LOG(INFO, "port %u stopping datapath", dev->data->port_id); + dev->rx_pkt_burst = removed_rx_burst; + dev->tx_pkt_burst = removed_tx_burst; + rte_mb(); + mp_init_msg(&priv->mp_id, &mp_res, param->type); + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + default: + rte_errno = EINVAL; + DRV_LOG(ERR, "port %u invalid mp request type", + dev->data->port_id); + return -rte_errno; + } + return ret; +} + +/** + * Broadcast request of stopping/starting data-path to secondary processes. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[in] type + * Request type. + */ +static void +mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx5_mp_req_type type) +{ + struct rte_mp_msg mp_req; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mlx5_mp_param *res; + struct timespec ts = {.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + int i; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (!mlx5_shared_data->secondary_cnt) + return; + if (type != MLX5_MP_REQ_START_RXTX && type != MLX5_MP_REQ_STOP_RXTX) { + DRV_LOG(ERR, "port %u unknown request (req_type %d)", + dev->data->port_id, type); + return; + } + mp_init_msg(&priv->mp_id, &mp_req, type); + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + if (rte_errno != ENOTSUP) + DRV_LOG(ERR, "port %u failed to request stop/start Rx/Tx (%d)", + dev->data->port_id, type); + goto exit; + } + if (mp_rep.nb_sent != mp_rep.nb_received) { + DRV_LOG(ERR, + "port %u not all secondaries responded (req_type %d)", + dev->data->port_id, type); + goto exit; + } + for (i = 0; i < mp_rep.nb_received; i++) { + mp_res = &mp_rep.msgs[i]; + res = (struct mlx5_mp_param *)mp_res->param; + if (res->result) { + DRV_LOG(ERR, "port %u request failed on secondary #%d", + dev->data->port_id, i); + goto exit; + } + } +exit: + free(mp_rep.msgs); +} + +/** + * Broadcast request of starting data-path to secondary processes. The request + * is synchronous. + * + * @param[in] dev + * Pointer to Ethernet structure. + */ +void +mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev) +{ + mp_req_on_rxtx(dev, MLX5_MP_REQ_START_RXTX); +} + +/** + * Broadcast request of stopping data-path to secondary processes. The request + * is synchronous. 
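+ * A caller reconfiguring the datapath would typically bracket the change
+ * with both requests, e.g.:
+ *
+ *   mlx5_mp_req_stop_rxtx(dev);
+ *   ... reconfigure Rx/Tx queues ...
+ *   mlx5_mp_req_start_rxtx(dev);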
+ * + * @param[in] dev + * Pointer to Ethernet structure. + */ +void +mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev) +{ + mp_req_on_rxtx(dev, MLX5_MP_REQ_STOP_RXTX); +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c new file mode 100644 index 000000000..2b4b3e289 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_eal_memconfig.h> +#include <rte_mempool.h> +#include <rte_malloc.h> +#include <rte_rwlock.h> +#include <rte_bus_pci.h> + +#include <mlx5_glue.h> +#include <mlx5_common_mp.h> +#include <mlx5_common_mr.h> + +#include "mlx5.h" +#include "mlx5_mr.h" +#include "mlx5_rxtx.h" + +struct mr_find_contig_memsegs_data { + uintptr_t addr; + uintptr_t start; + uintptr_t end; + const struct rte_memseg_list *msl; +}; + +struct mr_update_mp_data { + struct rte_eth_dev *dev; + struct mlx5_mr_ctrl *mr_ctrl; + int ret; +}; + +/** + * Callback for memory free event. Iterate freed memsegs and check whether it + * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a + * result, the MR would be fragmented. If it becomes empty, the MR will be freed + * later by mlx5_mr_garbage_collect(). Even if this callback is called from a + * secondary process, the garbage collector will be called in primary process + * as the secondary process can't call mlx5_mr_create(). + * + * The global cache must be rebuilt if there's any change and this event has to + * be propagated to dataplane threads to flush the local caches. + * + * @param sh + * Pointer to the Ethernet device shared context. + * @param addr + * Address of freed memory. + * @param len + * Size of freed memory. + */ +static void +mlx5_mr_mem_event_free_cb(struct mlx5_ibv_shared *sh, + const void *addr, size_t len) +{ + const struct rte_memseg_list *msl; + struct mlx5_mr *mr; + int ms_n; + int i; + int rebuild = 0; + + DEBUG("device %s free callback: addr=%p, len=%zu", + sh->ibdev_name, addr, len); + msl = rte_mem_virt2memseg_list(addr); + /* addr and len must be page-aligned. */ + MLX5_ASSERT((uintptr_t)addr == + RTE_ALIGN((uintptr_t)addr, msl->page_sz)); + MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz)); + ms_n = len / msl->page_sz; + rte_rwlock_write_lock(&sh->share_cache.rwlock); + /* Clear bits of freed memsegs from MR. */ + for (i = 0; i < ms_n; ++i) { + const struct rte_memseg *ms; + struct mr_cache_entry entry; + uintptr_t start; + int ms_idx; + uint32_t pos; + + /* Find MR having this memseg. */ + start = (uintptr_t)addr + i * msl->page_sz; + mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, start); + if (mr == NULL) + continue; + MLX5_ASSERT(mr->msl); /* Can't be external memory. 
*/ + ms = rte_mem_virt2memseg((void *)start, msl); + MLX5_ASSERT(ms != NULL); + MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + pos = ms_idx - mr->ms_base_idx; + MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos)); + MLX5_ASSERT(pos < mr->ms_bmp_n); + DEBUG("device %s MR(%p): clear bitmap[%u] for addr %p", + sh->ibdev_name, (void *)mr, pos, (void *)start); + rte_bitmap_clear(mr->ms_bmp, pos); + if (--mr->ms_n == 0) { + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr); + DEBUG("device %s remove MR(%p) from list", + sh->ibdev_name, (void *)mr); + } + /* + * MR is fragmented or will be freed. the global cache must be + * rebuilt. + */ + rebuild = 1; + } + if (rebuild) { + mlx5_mr_rebuild_cache(&sh->share_cache); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. + */ + ++sh->share_cache.dev_gen; + DEBUG("broadcasting local cache flush, gen=%d", + sh->share_cache.dev_gen); + rte_smp_wmb(); + } + rte_rwlock_write_unlock(&sh->share_cache.rwlock); +} + +/** + * Callback for memory event. This can be called from both primary and secondary + * process. + * + * @param event_type + * Memory event type. + * @param addr + * Address of memory. + * @param len + * Size of memory. + */ +void +mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg __rte_unused) +{ + struct mlx5_ibv_shared *sh; + struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list; + + /* Must be called from the primary process. */ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + switch (event_type) { + case RTE_MEM_EVENT_FREE: + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + /* Iterate all the existing mlx5 devices. */ + LIST_FOREACH(sh, dev_list, mem_event_cb) + mlx5_mr_mem_event_free_cb(sh, addr, len); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + break; + case RTE_MEM_EVENT_ALLOC: + default: + break; + } +} + +/** + * Bottom-half of LKey search on Rx. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr) +{ + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + struct mlx5_priv *priv = rxq_ctrl->priv; + + return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id, + &priv->sh->share_cache, mr_ctrl, addr, + priv->config.mr_ext_memseg_en); +} + +/** + * Bottom-half of LKey search on Tx. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. 
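+ *
+ * This is the slow path taken when the per-queue cache in txq->mr_ctrl
+ * misses; the address is resolved against the shared device cache through
+ * mlx5_mr_addr2mr_bh(), which may register new memory for it.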
+ */ +static uint32_t +mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr) +{ + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct mlx5_priv *priv = txq_ctrl->priv; + + return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id, + &priv->sh->share_cache, mr_ctrl, addr, + priv->config.mr_ext_memseg_en); +} + +/** + * Bottom-half of LKey search on Tx. If it can't be searched in the memseg + * list, register the mempool of the mbuf as externally allocated memory. + * + * @param txq + * Pointer to Tx queue structure. + * @param mb + * Pointer to mbuf. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb) +{ + uintptr_t addr = (uintptr_t)mb->buf_addr; + uint32_t lkey; + + lkey = mlx5_tx_addr2mr_bh(txq, addr); + if (lkey == UINT32_MAX && rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. */ + return mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb)); + } + return lkey; +} + +/** + * Called during rte_mempool_mem_iter() by mlx5_mr_update_ext_mp(). + * + * Externally allocated chunk is registered and a MR is created for the chunk. + * The MR object is added to the global list. If memseg list of a MR object + * (mr->msl) is null, the MR object can be regarded as externally allocated + * memory. + * + * Once external memory is registered, it should be static. If the memory is + * freed and the virtual address range has different physical memory mapped + * again, it may cause crash on device due to the wrong translation entry. PMD + * can't track the free event of the external memory for now. + */ +static void +mlx5_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + struct rte_eth_dev *dev = data->dev; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl; + struct mlx5_mr *mr = NULL; + uintptr_t addr = (uintptr_t)memhdr->addr; + size_t len = memhdr->len; + struct mr_cache_entry entry; + uint32_t lkey; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* If already registered, it should return. */ + rte_rwlock_read_lock(&sh->share_cache.rwlock); + lkey = mlx5_mr_lookup_cache(&sh->share_cache, &entry, addr); + rte_rwlock_read_unlock(&sh->share_cache.rwlock); + if (lkey != UINT32_MAX) + return; + DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)", + dev->data->port_id, mem_idx, mp->name); + mr = mlx5_create_mr_ext(sh->pd, addr, len, mp->socket_id); + if (!mr) { + DRV_LOG(WARNING, + "port %u unable to allocate a new MR of" + " mempool (%s).", + dev->data->port_id, mp->name); + data->ret = -1; + return; + } + rte_rwlock_write_lock(&sh->share_cache.rwlock); + LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr); + /* Insert to the global cache table. */ + mlx5_mr_insert_cache(&sh->share_cache, mr); + rte_rwlock_write_unlock(&sh->share_cache.rwlock); + /* Insert to the local cache table */ + mlx5_mr_addr2mr_bh(sh->pd, &priv->mp_id, &sh->share_cache, + mr_ctrl, addr, priv->config.mr_ext_memseg_en); +} + +/** + * Finds the first ethdev that match the pci device. + * The existence of multiple ethdev per pci device is only with representors. + * On such case, it is enough to get only one of the ports as they all share + * the same ibv context. 
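+ *
+ * For reference, the DMA map/unmap callbacks below are reached through the
+ * generic EAL/ethdev API rather than called directly; an illustrative,
+ * application-side sketch (rte_extmem_register() and rte_dev_dma_map() are
+ * generic DPDK calls, IOVA handling and error checking omitted):
+ *
+ *	rte_extmem_register(addr, len, NULL, 0, pg_sz);
+ *	rte_dev_dma_map(dev->device, addr, iova, len);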
+ * + * @param pdev + * Pointer to the PCI device. + * + * @return + * Pointer to the ethdev if found, NULL otherwise. + */ +static struct rte_eth_dev * +pci_dev_to_eth_dev(struct rte_pci_device *pdev) +{ + uint16_t port_id; + + RTE_ETH_FOREACH_DEV_OF(port_id, &pdev->device) + return &rte_eth_devices[port_id]; + return NULL; +} + +/** + * DPDK callback to DMA map external memory to a PCI device. + * + * @param pdev + * Pointer to the PCI device. + * @param addr + * Starting virtual address of memory to be mapped. + * @param iova + * Starting IOVA address of memory to be mapped. + * @param len + * Length of memory segment being mapped. + * + * @return + * 0 on success, negative value on error. + */ +int +mlx5_dma_map(struct rte_pci_device *pdev, void *addr, + uint64_t iova __rte_unused, size_t len) +{ + struct rte_eth_dev *dev; + struct mlx5_mr *mr; + struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; + + dev = pci_dev_to_eth_dev(pdev); + if (!dev) { + DRV_LOG(WARNING, "unable to find matching ethdev " + "to PCI device %p", (void *)pdev); + rte_errno = ENODEV; + return -1; + } + priv = dev->data->dev_private; + sh = priv->sh; + mr = mlx5_create_mr_ext(sh->pd, (uintptr_t)addr, len, SOCKET_ID_ANY); + if (!mr) { + DRV_LOG(WARNING, + "port %u unable to dma map", dev->data->port_id); + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&sh->share_cache.rwlock); + LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr); + /* Insert to the global cache table. */ + mlx5_mr_insert_cache(&sh->share_cache, mr); + rte_rwlock_write_unlock(&sh->share_cache.rwlock); + return 0; +} + +/** + * DPDK callback to DMA unmap external memory to a PCI device. + * + * @param pdev + * Pointer to the PCI device. + * @param addr + * Starting virtual address of memory to be unmapped. + * @param iova + * Starting IOVA address of memory to be unmapped. + * @param len + * Length of memory segment being unmapped. + * + * @return + * 0 on success, negative value on error. + */ +int +mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, + uint64_t iova __rte_unused, size_t len __rte_unused) +{ + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; + struct mlx5_mr *mr; + struct mr_cache_entry entry; + + dev = pci_dev_to_eth_dev(pdev); + if (!dev) { + DRV_LOG(WARNING, "unable to find matching ethdev " + "to PCI device %p", (void *)pdev); + rte_errno = ENODEV; + return -1; + } + priv = dev->data->dev_private; + sh = priv->sh; + rte_rwlock_read_lock(&sh->share_cache.rwlock); + mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, (uintptr_t)addr); + if (!mr) { + rte_rwlock_read_unlock(&sh->share_cache.rwlock); + DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered " + "to PCI device %p", (uintptr_t)addr, + (void *)pdev); + rte_errno = EINVAL; + return -1; + } + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr); + DEBUG("port %u remove MR(%p) from list", dev->data->port_id, + (void *)mr); + mlx5_mr_rebuild_cache(&sh->share_cache); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. 
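+ *
+ * In other words, dev_gen acts as a generation counter for the shared MR
+ * cache: the control path increments it after rebuilding the global cache,
+ * and each datapath queue drops its local cache when it notices the change.
+ * A rough reader-side sketch (the actual check lives in the common MR code,
+ * not in this file):
+ *
+ *	if (mr_ctrl->cur_gen != *mr_ctrl->dev_gen_ptr) {
+ *		mlx5_mr_flush_local_cache(mr_ctrl);
+ *		mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
+ *	}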
+ */ + ++sh->share_cache.dev_gen; + DEBUG("broadcasting local cache flush, gen=%d", + sh->share_cache.dev_gen); + rte_smp_wmb(); + rte_rwlock_read_unlock(&sh->share_cache.rwlock); + return 0; +} + +/** + * Register MR for entire memory chunks in a Mempool having externally allocated + * memory and fill in local cache. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +static uint32_t +mlx5_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx5_mr_update_ext_mp_cb, &data); + return data.ret; +} + +/** + * Register MR entire memory chunks in a Mempool having externally allocated + * memory and search LKey of the address to return. + * + * @param dev + * Pointer to Ethernet device. + * @param addr + * Search key. + * @param mp + * Pointer to registering Mempool where addr belongs. + * + * @return + * LKey for address on success, UINT32_MAX on failure. + */ +uint32_t +mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr, + struct rte_mempool *mp) +{ + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct mlx5_priv *priv = txq_ctrl->priv; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + DRV_LOG(WARNING, + "port %u using address (%p) from unregistered mempool" + " having externally allocated memory" + " in secondary process, please create mempool" + " prior to rte_eth_dev_start()", + PORT_ID(priv), (void *)addr); + return UINT32_MAX; + } + mlx5_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp); + return mlx5_tx_addr2mr_bh(txq, addr); +} + +/* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */ +static void +mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + struct rte_eth_dev *dev = data->dev; + struct mlx5_priv *priv = dev->data->dev_private; + + uint32_t lkey; + + /* Stop iteration if failed in the previous walk. */ + if (data->ret < 0) + return; + /* Register address of the chunk and update local caches. */ + lkey = mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id, + &priv->sh->share_cache, data->mr_ctrl, + (uintptr_t)memhdr->addr, + priv->config.mr_ext_memseg_en); + if (lkey == UINT32_MAX) + data->ret = -1; +} + +/** + * Register entire memory chunks in a Mempool. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +int +mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data); + if (data.ret < 0 && rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. 
*/ + return mlx5_mr_update_ext_mp(dev, mr_ctrl, mp); + } + return data.ret; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h new file mode 100644 index 000000000..0c5877b3d --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_MR_H_ +#define RTE_PMD_MLX5_MR_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev.h> +#include <rte_rwlock.h> +#include <rte_bitmap.h> +#include <rte_memory.h> + +#include <mlx5_common_mr.h> + +/* First entry must be NULL for comparison. */ +#define mlx5_mr_btree_len(bt) ((bt)->len - 1) + +void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg); +int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp); + +#endif /* RTE_PMD_MLX5_MR_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c new file mode 100644 index 000000000..653b06914 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <stdint.h> +#include <errno.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" + +/** + * DPDK callback to update the RSS hash configuration. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in] rss_conf + * RSS configuration data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rss_hash_update(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + unsigned int idx; + + if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) { + rte_errno = EINVAL; + return -rte_errno; + } + if (rss_conf->rss_key && rss_conf->rss_key_len) { + if (rss_conf->rss_key_len != MLX5_RSS_HASH_KEY_LEN) { + DRV_LOG(ERR, + "port %u RSS key len must be %s Bytes long", + dev->data->port_id, + RTE_STR(MLX5_RSS_HASH_KEY_LEN)); + rte_errno = EINVAL; + return -rte_errno; + } + priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key, + rss_conf->rss_key_len, 0); + if (!priv->rss_conf.rss_key) { + rte_errno = ENOMEM; + return -rte_errno; + } + memcpy(priv->rss_conf.rss_key, rss_conf->rss_key, + rss_conf->rss_key_len); + priv->rss_conf.rss_key_len = rss_conf->rss_key_len; + } + priv->rss_conf.rss_hf = rss_conf->rss_hf; + /* Enable the RSS hash in all Rx queues. 
*/ + for (i = 0, idx = 0; idx != priv->rxqs_n; ++i) { + if (!(*priv->rxqs)[i]) + continue; + (*priv->rxqs)[i]->rss_hash = !!rss_conf->rss_hf && + !!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS); + ++idx; + } + return 0; +} + +/** + * DPDK callback to get the RSS hash configuration. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in, out] rss_conf + * RSS configuration data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!rss_conf) { + rte_errno = EINVAL; + return -rte_errno; + } + if (rss_conf->rss_key && + (rss_conf->rss_key_len >= priv->rss_conf.rss_key_len)) { + memcpy(rss_conf->rss_key, priv->rss_conf.rss_key, + priv->rss_conf.rss_key_len); + } + rss_conf->rss_key_len = priv->rss_conf.rss_key_len; + rss_conf->rss_hf = priv->rss_conf.rss_hf; + return 0; +} + +/** + * Allocate/reallocate RETA index table. + * + * @param dev + * Pointer to Ethernet device. + * @praram reta_size + * The size of the array to allocate. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + void *mem; + unsigned int old_size = priv->reta_idx_n; + + if (priv->reta_idx_n == reta_size) + return 0; + + mem = rte_realloc(priv->reta_idx, + reta_size * sizeof((*priv->reta_idx)[0]), 0); + if (!mem) { + rte_errno = ENOMEM; + return -rte_errno; + } + priv->reta_idx = mem; + priv->reta_idx_n = reta_size; + if (old_size < reta_size) + memset(&(*priv->reta_idx)[old_size], 0, + (reta_size - old_size) * + sizeof((*priv->reta_idx)[0])); + return 0; +} + +/** + * DPDK callback to get the RETA indirection table. + * + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. + * @param reta_size + * Size of the RETA table. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int idx; + unsigned int i; + + if (!reta_size || reta_size > priv->reta_idx_n) { + rte_errno = EINVAL; + return -rte_errno; + } + /* Fill each entry of the table even if its bit is not set. */ + for (idx = 0, i = 0; (i != reta_size); ++i) { + idx = i / RTE_RETA_GROUP_SIZE; + reta_conf[idx].reta[i % RTE_RETA_GROUP_SIZE] = + (*priv->reta_idx)[i]; + } + return 0; +} + +/** + * DPDK callback to update the RETA indirection table. + * + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. + * @param reta_size + * Size of the RETA table. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
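+ *
+ * Note the new queue indices take effect immediately; if the device is
+ * already started the port is restarted with the updated table. An
+ * illustrative application-side call (generic ethdev API) steering a
+ * 64-entry table to queues 0 and 1 alternately:
+ *
+ *	struct rte_eth_rss_reta_entry64 reta_conf[1];
+ *	int i;
+ *
+ *	memset(reta_conf, 0, sizeof(reta_conf));
+ *	reta_conf[0].mask = UINT64_MAX;
+ *	for (i = 0; i < 64; i++)
+ *		reta_conf[0].reta[i] = i % 2;
+ *	rte_eth_dev_rss_reta_update(port_id, reta_conf, 64);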
+ */ +int +mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size) +{ + int ret; + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int idx; + unsigned int i; + unsigned int pos; + + if (!reta_size) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_rss_reta_index_resize(dev, reta_size); + if (ret) + return ret; + for (idx = 0, i = 0; (i != reta_size); ++i) { + idx = i / RTE_RETA_GROUP_SIZE; + pos = i % RTE_RETA_GROUP_SIZE; + if (((reta_conf[idx].mask >> i) & 0x1) == 0) + continue; + MLX5_ASSERT(reta_conf[idx].reta[pos] < priv->rxqs_n); + (*priv->reta_idx)[i] = reta_conf[idx].reta[pos]; + } + if (dev->data->dev_started) { + mlx5_dev_stop(dev); + priv->skip_default_rss_reta = 1; + return mlx5_dev_start(dev); + } + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c new file mode 100644 index 000000000..84c8b0526 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev_driver.h> + +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +/** + * DPDK callback to enable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_promiscuous_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->promiscuous = 1; + if (priv->isolated) { + DRV_LOG(WARNING, + "port %u cannot enable promiscuous mode" + " in flow isolation mode", + dev->data->port_id); + return 0; + } + if (priv->config.vf) { + ret = mlx5_nl_promisc(priv->nl_socket_route, mlx5_ifindex(dev), + 1); + if (ret) + return ret; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); + + /* + * rte_eth_dev_promiscuous_enable() rollback + * dev->data->promiscuous in the case of failure. + */ + return ret; +} + +/** + * DPDK callback to disable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_promiscuous_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->promiscuous = 0; + if (priv->config.vf) { + ret = mlx5_nl_promisc(priv->nl_socket_route, mlx5_ifindex(dev), + 0); + if (ret) + return ret; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); + + /* + * rte_eth_dev_promiscuous_disable() rollback + * dev->data->promiscuous in the case of failure. + */ + return ret; +} + +/** + * DPDK callback to enable allmulti mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
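+ *
+ * Application-side sketch (generic ethdev API); the return value should be
+ * checked since the Netlink request issued for VF devices below can fail:
+ *
+ *	if (rte_eth_allmulticast_enable(port_id) != 0) {
+ *		// allmulticast mode could not be enabled on this port
+ *	}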
+ */ +int +mlx5_allmulticast_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->all_multicast = 1; + if (priv->isolated) { + DRV_LOG(WARNING, + "port %u cannot enable allmulticast mode" + " in flow isolation mode", + dev->data->port_id); + return 0; + } + if (priv->config.vf) { + ret = mlx5_nl_allmulti(priv->nl_socket_route, mlx5_ifindex(dev), + 1); + if (ret) + goto error; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); +error: + /* + * rte_eth_allmulticast_enable() rollback + * dev->data->all_multicast in the case of failure. + */ + return ret; +} + +/** + * DPDK callback to disable allmulti mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_allmulticast_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->all_multicast = 0; + if (priv->config.vf) { + ret = mlx5_nl_allmulti(priv->nl_socket_route, mlx5_ifindex(dev), + 0); + if (ret) + goto error; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); +error: + /* + * rte_eth_allmulticast_disable() rollback + * dev->data->all_multicast in the case of failure. + */ + return ret; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c new file mode 100644 index 000000000..7a50ec6f1 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c @@ -0,0 +1,2976 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <string.h> +#include <stdint.h> +#include <fcntl.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_debug.h> +#include <rte_io.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" +#include "mlx5_autoconf.h" +#include "mlx5_flow.h" + + +/* Default RSS hash key also used for ConnectX-3. */ +uint8_t rss_hash_default_key[] = { + 0x2c, 0xc6, 0x81, 0xd1, + 0x5b, 0xdb, 0xf4, 0xf7, + 0xfc, 0xa2, 0x83, 0x19, + 0xdb, 0x1a, 0x3e, 0x94, + 0x6b, 0x9e, 0x38, 0xd9, + 0x2c, 0x9c, 0x03, 0xd1, + 0xad, 0x99, 0x44, 0xa7, + 0xd9, 0x56, 0x3d, 0x59, + 0x06, 0x3c, 0x25, 0xf3, + 0xfc, 0x1f, 0xdc, 0x2a, +}; + +/* Length of the default RSS hash key. */ +static_assert(MLX5_RSS_HASH_KEY_LEN == + (unsigned int)sizeof(rss_hash_default_key), + "wrong RSS default key size."); + +/** + * Check whether Multi-Packet RQ can be enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 1 if supported, negative errno value if not. 
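+ *
+ * MPRQ is an all-or-nothing property of the port: it is reported as
+ * supported only when it was requested through the device parameters
+ * (config.mprq.enabled) and the port is configured with at least
+ * config.mprq.min_rxqs_num Rx queues; otherwise every queue falls back to
+ * the regular Single-Packet RQ. A typical guard (illustrative):
+ *
+ *	if (mlx5_check_mprq_support(dev) < 0) {
+ *		// too few Rx queues or MPRQ not requested, use SPRQ
+ *	}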
+ */ +inline int +mlx5_check_mprq_support(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->config.mprq.enabled && + priv->rxqs_n >= priv->config.mprq.min_rxqs_num) + return 1; + return -ENOTSUP; +} + +/** + * Check whether Multi-Packet RQ is enabled for the Rx queue. + * + * @param rxq + * Pointer to receive queue structure. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq) +{ + return rxq->strd_num_n > 0; +} + +/** + * Check whether Multi-Packet RQ is enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_mprq_enabled(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint16_t i; + uint16_t n = 0; + uint16_t n_ibv = 0; + + if (mlx5_check_mprq_support(dev) < 0) + return 0; + /* All the configured queues should be enabled. */ + for (i = 0; i < priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = container_of + (rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD) + continue; + n_ibv++; + if (mlx5_rxq_mprq_enabled(rxq)) + ++n; + } + /* Multi-Packet RQ can't be partially configured. */ + MLX5_ASSERT(n == 0 || n == n_ibv); + return n == n_ibv; +} + +/** + * Allocate RX queue elements for Multi-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + unsigned int wqe_n = 1 << rxq->elts_n; + unsigned int i; + int err; + + /* Iterate on segments. */ + for (i = 0; i <= wqe_n; ++i) { + struct mlx5_mprq_buf *buf; + + if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) { + DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id); + rte_errno = ENOMEM; + goto error; + } + if (i < wqe_n) + (*rxq->mprq_bufs)[i] = buf; + else + rxq->mprq_repl = buf; + } + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments", + rxq->port_id, rxq->idx, wqe_n); + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + wqe_n = i; + for (i = 0; (i != wqe_n); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + rte_mempool_put(rxq->mprq_mp, + (*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + rxq->port_id, rxq->idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Allocate RX queue elements for Single-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, errno value on failure. + */ +static int +rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n; + unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; + unsigned int i; + int err; + + /* Iterate on segments. */ + for (i = 0; (i != elts_n); ++i) { + struct rte_mbuf *buf; + + buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp); + if (buf == NULL) { + DRV_LOG(ERR, "port %u empty mbuf pool", + PORT_ID(rxq_ctrl->priv)); + rte_errno = ENOMEM; + goto error; + } + /* Headroom is reserved by rte_pktmbuf_alloc(). */ + MLX5_ASSERT(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); + /* Buffer is supposed to be empty. 
*/ + MLX5_ASSERT(rte_pktmbuf_data_len(buf) == 0); + MLX5_ASSERT(rte_pktmbuf_pkt_len(buf) == 0); + MLX5_ASSERT(!buf->next); + /* Only the first segment keeps headroom. */ + if (i % sges_n) + SET_DATA_OFF(buf, 0); + PORT(buf) = rxq_ctrl->rxq.port_id; + DATA_LEN(buf) = rte_pktmbuf_tailroom(buf); + PKT_LEN(buf) = DATA_LEN(buf); + NB_SEGS(buf) = 1; + (*rxq_ctrl->rxq.elts)[i] = buf; + } + /* If Rx vector is activated. */ + if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) { + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + struct rte_mbuf *mbuf_init = &rxq->fake_mbuf; + struct rte_pktmbuf_pool_private *priv = + (struct rte_pktmbuf_pool_private *) + rte_mempool_get_priv(rxq_ctrl->rxq.mp); + int j; + + /* Initialize default rearm_data for vPMD. */ + mbuf_init->data_off = RTE_PKTMBUF_HEADROOM; + rte_mbuf_refcnt_set(mbuf_init, 1); + mbuf_init->nb_segs = 1; + mbuf_init->port = rxq->port_id; + if (priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) + mbuf_init->ol_flags = EXT_ATTACHED_MBUF; + /* + * prevent compiler reordering: + * rearm_data covers previous fields. + */ + rte_compiler_barrier(); + rxq->mbuf_initializer = + *(rte_xmm_t *)&mbuf_init->rearm_data; + /* Padding with a fake mbuf for vectorized Rx. */ + for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j) + (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf; + } + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments" + " (max %u packets)", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n, + elts_n / (1 << rxq_ctrl->rxq.sges_n)); + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + elts_n = i; + for (i = 0; (i != elts_n); ++i) { + if ((*rxq_ctrl->rxq.elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); + (*rxq_ctrl->rxq.elts)[i] = NULL; + } + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Allocate RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, errno value on failure. + */ +int +rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl); +} + +/** + * Free RX queue elements for Multi-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + uint16_t i; + + DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs", + rxq->port_id, rxq->idx); + if (rxq->mprq_bufs == NULL) + return; + MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0); + for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + if (rxq->mprq_repl != NULL) { + mlx5_mprq_buf_free(rxq->mprq_repl); + rxq->mprq_repl = NULL; + } +} + +/** + * Free RX queue elements for Single-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + const uint16_t q_n = (1 << rxq->elts_n); + const uint16_t q_mask = q_n - 1; + uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); + uint16_t i; + + DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs", + PORT_ID(rxq_ctrl->priv), rxq->idx); + if (rxq->elts == NULL) + return; + /** + * Some mbuf in the Ring belongs to the application. 
They cannot be + * freed. + */ + if (mlx5_rxq_check_vec_support(rxq) > 0) { + for (i = 0; i < used; ++i) + (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL; + rxq->rq_pi = rxq->rq_ci; + } + for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + if ((*rxq->elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq->elts)[i]); + (*rxq->elts)[i] = NULL; + } +} + +/** + * Free RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + rxq_free_elts_mprq(rxq_ctrl); + else + rxq_free_elts_sprq(rxq_ctrl); +} + +/** + * Returns the per-queue supported offloads. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Supported Rx offloads. + */ +uint64_t +mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER | + DEV_RX_OFFLOAD_TIMESTAMP | + DEV_RX_OFFLOAD_JUMBO_FRAME | + DEV_RX_OFFLOAD_RSS_HASH); + + if (config->hw_fcs_strip) + offloads |= DEV_RX_OFFLOAD_KEEP_CRC; + + if (config->hw_csum) + offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM | + DEV_RX_OFFLOAD_UDP_CKSUM | + DEV_RX_OFFLOAD_TCP_CKSUM); + if (config->hw_vlan_strip) + offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; + if (MLX5_LRO_SUPPORTED(dev)) + offloads |= DEV_RX_OFFLOAD_TCP_LRO; + return offloads; +} + + +/** + * Returns the per-port supported offloads. + * + * @return + * Supported Rx offloads. + */ +uint64_t +mlx5_get_rx_port_offloads(void) +{ + uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER; + + return offloads; +} + +/** + * Verify if the queue can be released. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * + * @return + * 1 if the queue can be released + * 0 if the queue can not be released, there are references to it. + * Negative errno and rte_errno is set if queue doesn't exist. + */ +static int +mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl; + + if (!(*priv->rxqs)[idx]) { + rte_errno = EINVAL; + return -rte_errno; + } + rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); + return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1); +} + +/** + * Rx queue presetup checks. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
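+ *
+ * Note that a descriptor count which is not a power of two is silently
+ * rounded up, so a request for e.g. 1000 descriptors actually configures
+ * 1024; the adjusted value is what the rest of the setup path sees:
+ *
+ *	if (!rte_is_power_of_2(desc))
+ *		desc = 1 << log2above(desc);	// e.g. 1000 -> 1024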
+ */ +static int +mlx5_rx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Rx queue %u" + " to the next power of two (%d)", + dev->data->port_id, idx, desc); + } + DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors", + dev->data->port_id, idx, desc); + if (idx >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->rxqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; + } + if (!mlx5_rxq_releasable(dev, idx)) { + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + rte_errno = EBUSY; + return -rte_errno; + } + mlx5_rxq_release(dev, idx); + return 0; +} + +/** + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * @param mp + * Memory pool for buffer allocations. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + int res; + + res = mlx5_rx_queue_pre_setup(dev, idx, desc); + if (res) + return res; + rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp); + if (!rxq_ctrl) { + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + rte_errno = ENOMEM; + return -rte_errno; + } + DRV_LOG(DEBUG, "port %u adding Rx queue %u to list", + dev->data->port_id, idx); + (*priv->rxqs)[idx] = &rxq_ctrl->rxq; + return 0; +} + +/** + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param hairpin_conf + * Hairpin configuration parameters. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
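+ *
+ * Illustrative application-side sketch (generic ethdev API, not defined in
+ * this file) binding Rx hairpin queue idx to Tx queue txq_id of the same
+ * port, which is the only peer layout accepted by the checks below:
+ *
+ *	struct rte_eth_hairpin_conf conf = {
+ *		.peer_count = 1,
+ *		.peers[0] = { .port = port_id, .queue = txq_id },
+ *	};
+ *
+ *	rte_eth_rx_hairpin_queue_setup(port_id, idx, desc, &conf);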
+ */
+int
+mlx5_rx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+			    uint16_t desc,
+			    const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	int res;
+
+	res = mlx5_rx_queue_pre_setup(dev, idx, desc);
+	if (res)
+		return res;
+	if (hairpin_conf->peer_count != 1 ||
+	    hairpin_conf->peers[0].port != dev->data->port_id ||
+	    hairpin_conf->peers[0].queue >= priv->txqs_n) {
+		DRV_LOG(ERR, "port %u unable to set up hairpin queue index %u:"
+			" invalid hairpin configuration", dev->data->port_id,
+			idx);
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	rxq_ctrl = mlx5_rxq_hairpin_new(dev, idx, desc, hairpin_conf);
+	if (!rxq_ctrl) {
+		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
+			dev->data->port_id, idx);
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
+		dev->data->port_id, idx);
+	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+	return 0;
+}
+
+/**
+ * DPDK callback to release an Rx queue.
+ *
+ * @param dpdk_rxq
+ *   Generic Rx queue pointer.
+ */
+void
+mlx5_rx_queue_release(void *dpdk_rxq)
+{
+	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+	struct mlx5_priv *priv;
+
+	if (rxq == NULL)
+		return;
+	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	priv = rxq_ctrl->priv;
+	if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.idx))
+		rte_panic("port %u Rx queue %u is still used by a flow and"
+			  " cannot be removed\n",
+			  PORT_ID(priv), rxq->idx);
+	mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.idx);
+}
+
+/**
+ * Get an Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array.
+ *
+ * @return
+ *   The Verbs/DevX object if it exists.
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_get(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+
+	if (idx >= priv->rxqs_n)
+		return NULL;
+	if (!rxq_data)
+		return NULL;
+	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	if (rxq_ctrl->obj)
+		rte_atomic32_inc(&rxq_ctrl->obj->refcnt);
+	return rxq_ctrl->obj;
+}
+
+/**
+ * Release the resources allocated for an RQ DevX object.
+ *
+ * @param rxq_ctrl
+ *   DevX Rx queue object.
+ */
+static void
+rxq_release_rq_resources(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+	if (rxq_ctrl->rxq.wqes) {
+		rte_free((void *)(uintptr_t)rxq_ctrl->rxq.wqes);
+		rxq_ctrl->rxq.wqes = NULL;
+	}
+	if (rxq_ctrl->wq_umem) {
+		mlx5_glue->devx_umem_dereg(rxq_ctrl->wq_umem);
+		rxq_ctrl->wq_umem = NULL;
+	}
+}
+
+/**
+ * Release Rx hairpin related resources.
+ *
+ * @param rxq_obj
+ *   Hairpin Rx queue object.
+ */
+static void
+rxq_obj_hairpin_release(struct mlx5_rxq_obj *rxq_obj)
+{
+	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
+
+	MLX5_ASSERT(rxq_obj);
+	rq_attr.state = MLX5_RQC_STATE_RST;
+	rq_attr.rq_state = MLX5_RQC_STATE_RDY;
+	mlx5_devx_cmd_modify_rq(rxq_obj->rq, &rq_attr);
+	claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq));
+}
+
+/**
+ * Release an Rx Verbs/DevX queue object.
+ *
+ * @param rxq_obj
+ *   Verbs/DevX Rx queue object.
+ *
+ * @return
+ *   1 while a reference on it exists, 0 when freed.
+ */ +static int +mlx5_rxq_obj_release(struct mlx5_rxq_obj *rxq_obj) +{ + MLX5_ASSERT(rxq_obj); + if (rte_atomic32_dec_and_test(&rxq_obj->refcnt)) { + switch (rxq_obj->type) { + case MLX5_RXQ_OBJ_TYPE_IBV: + MLX5_ASSERT(rxq_obj->wq); + MLX5_ASSERT(rxq_obj->cq); + rxq_free_elts(rxq_obj->rxq_ctrl); + claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq)); + claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq)); + break; + case MLX5_RXQ_OBJ_TYPE_DEVX_RQ: + MLX5_ASSERT(rxq_obj->cq); + MLX5_ASSERT(rxq_obj->rq); + rxq_free_elts(rxq_obj->rxq_ctrl); + claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq)); + rxq_release_rq_resources(rxq_obj->rxq_ctrl); + claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq)); + break; + case MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN: + MLX5_ASSERT(rxq_obj->rq); + rxq_obj_hairpin_release(rxq_obj); + break; + } + if (rxq_obj->channel) + claim_zero(mlx5_glue->destroy_comp_channel + (rxq_obj->channel)); + LIST_REMOVE(rxq_obj, next); + rte_free(rxq_obj); + return 0; + } + return 1; +} + +/** + * Allocate queue vector and fill epoll fd list for Rx interrupts. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + unsigned int rxqs_n = priv->rxqs_n; + unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + unsigned int count = 0; + struct rte_intr_handle *intr_handle = dev->intr_handle; + + if (!dev->data->dev_conf.intr_conf.rxq) + return 0; + mlx5_rx_intr_vec_disable(dev); + intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0])); + if (intr_handle->intr_vec == NULL) { + DRV_LOG(ERR, + "port %u failed to allocate memory for interrupt" + " vector, Rx interrupts will not be supported", + dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; + } + intr_handle->type = RTE_INTR_HANDLE_EXT; + for (i = 0; i != n; ++i) { + /* This rxq obj must not be released in this function. */ + struct mlx5_rxq_obj *rxq_obj = mlx5_rxq_obj_get(dev, i); + int fd; + int flags; + int rc; + + /* Skip queues that cannot request interrupts. */ + if (!rxq_obj || !rxq_obj->channel) { + /* Use invalid intr_vec[] index to disable entry. */ + intr_handle->intr_vec[i] = + RTE_INTR_VEC_RXTX_OFFSET + + RTE_MAX_RXTX_INTR_VEC_ID; + continue; + } + if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { + DRV_LOG(ERR, + "port %u too many Rx queues for interrupt" + " vector size (%d), Rx interrupts cannot be" + " enabled", + dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID); + mlx5_rx_intr_vec_disable(dev); + rte_errno = ENOMEM; + return -rte_errno; + } + fd = rxq_obj->channel->fd; + flags = fcntl(fd, F_GETFL); + rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK); + if (rc < 0) { + rte_errno = errno; + DRV_LOG(ERR, + "port %u failed to make Rx interrupt file" + " descriptor %d non-blocking for queue index" + " %d", + dev->data->port_id, fd, i); + mlx5_rx_intr_vec_disable(dev); + return -rte_errno; + } + intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; + intr_handle->efds[count] = fd; + count++; + } + if (!count) + mlx5_rx_intr_vec_disable(dev); + else + intr_handle->nb_efd = count; + return 0; +} + +/** + * Clean up Rx interrupts handler. + * + * @param dev + * Pointer to Ethernet device. 
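+ *
+ * For reference, the event file descriptors collected by
+ * mlx5_rx_intr_vec_enable() are consumed through the generic ethdev/EAL
+ * interrupt API; an illustrative application-side wait (error handling
+ * omitted):
+ *
+ *	struct rte_epoll_event ev;
+ *
+ *	rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
+ *				  RTE_INTR_EVENT_ADD, NULL);
+ *	rte_eth_dev_rx_intr_enable(port_id, queue_id);
+ *	rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);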
+ */ +void +mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_intr_handle *intr_handle = dev->intr_handle; + unsigned int i; + unsigned int rxqs_n = priv->rxqs_n; + unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + if (!dev->data->dev_conf.intr_conf.rxq) + return; + if (!intr_handle->intr_vec) + goto free; + for (i = 0; i != n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl; + struct mlx5_rxq_data *rxq_data; + + if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET + + RTE_MAX_RXTX_INTR_VEC_ID) + continue; + /** + * Need to access directly the queue to release the reference + * kept in mlx5_rx_intr_vec_enable(). + */ + rxq_data = (*priv->rxqs)[i]; + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (rxq_ctrl->obj) + mlx5_rxq_obj_release(rxq_ctrl->obj); + } +free: + rte_intr_free_epoll_fd(intr_handle); + if (intr_handle->intr_vec) + free(intr_handle->intr_vec); + intr_handle->nb_efd = 0; + intr_handle->intr_vec = NULL; +} + +/** + * MLX5 CQ notification . + * + * @param rxq + * Pointer to receive queue structure. + * @param sq_n_rxq + * Sequence number per receive queue . + */ +static inline void +mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq) +{ + int sq_n = 0; + uint32_t doorbell_hi; + uint64_t doorbell; + void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL; + + sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK; + doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK); + doorbell = (uint64_t)doorbell_hi << 32; + doorbell |= rxq->cqn; + rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi); + mlx5_uar_write64(rte_cpu_to_be_64(doorbell), + cq_db_reg, rxq->uar_lock_cq); +} + +/** + * DPDK callback for Rx queue interrupt enable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param rx_queue_id + * Rx queue number. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data; + struct mlx5_rxq_ctrl *rxq_ctrl; + + rxq_data = (*priv->rxqs)[rx_queue_id]; + if (!rxq_data) { + rte_errno = EINVAL; + return -rte_errno; + } + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (rxq_ctrl->irq) { + struct mlx5_rxq_obj *rxq_obj; + + rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id); + if (!rxq_obj) { + rte_errno = EINVAL; + return -rte_errno; + } + mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn); + mlx5_rxq_obj_release(rxq_obj); + } + return 0; +} + +/** + * DPDK callback for Rx queue interrupt disable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param rx_queue_id + * Rx queue number. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +int +mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data; + struct mlx5_rxq_ctrl *rxq_ctrl; + struct mlx5_rxq_obj *rxq_obj = NULL; + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret; + + rxq_data = (*priv->rxqs)[rx_queue_id]; + if (!rxq_data) { + rte_errno = EINVAL; + return -rte_errno; + } + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (!rxq_ctrl->irq) + return 0; + rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id); + if (!rxq_obj) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_glue->get_cq_event(rxq_obj->channel, &ev_cq, &ev_ctx); + if (ret || ev_cq != rxq_obj->cq) { + rte_errno = EINVAL; + goto exit; + } + rxq_data->cq_arm_sn++; + mlx5_glue->ack_cq_events(rxq_obj->cq, 1); + mlx5_rxq_obj_release(rxq_obj); + return 0; +exit: + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (rxq_obj) + mlx5_rxq_obj_release(rxq_obj); + DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d", + dev->data->port_id, rx_queue_id); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Create a CQ Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param priv + * Pointer to device private data. + * @param rxq_data + * Pointer to Rx queue data. + * @param cqe_n + * Number of CQEs in CQ. + * @param rxq_obj + * Pointer to Rx queue object data. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +static struct ibv_cq * +mlx5_ibv_cq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv, + struct mlx5_rxq_data *rxq_data, + unsigned int cqe_n, struct mlx5_rxq_obj *rxq_obj) +{ + struct { + struct ibv_cq_init_attr_ex ibv; + struct mlx5dv_cq_init_attr mlx5; + } cq_attr; + + cq_attr.ibv = (struct ibv_cq_init_attr_ex){ + .cqe = cqe_n, + .channel = rxq_obj->channel, + .comp_mask = 0, + }; + cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){ + .comp_mask = 0, + }; + if (priv->config.cqe_comp && !rxq_data->hw_timestamp && + !rxq_data->lro) { + cq_attr.mlx5.comp_mask |= + MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + cq_attr.mlx5.cqe_comp_res_format = + mlx5_rxq_mprq_enabled(rxq_data) ? + MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX : + MLX5DV_CQE_RES_FORMAT_HASH; +#else + cq_attr.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH; +#endif + /* + * For vectorized Rx, it must not be doubled in order to + * make cq_ci and rq_ci aligned. + */ + if (mlx5_rxq_check_vec_support(rxq_data) < 0) + cq_attr.ibv.cqe *= 2; + } else if (priv->config.cqe_comp && rxq_data->hw_timestamp) { + DRV_LOG(DEBUG, + "port %u Rx CQE compression is disabled for HW" + " timestamp", + dev->data->port_id); + } else if (priv->config.cqe_comp && rxq_data->lro) { + DRV_LOG(DEBUG, + "port %u Rx CQE compression is disabled for LRO", + dev->data->port_id); + } +#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD + if (priv->config.cqe_pad) { + cq_attr.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS; + cq_attr.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD; + } +#endif + return mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(priv->sh->ctx, + &cq_attr.ibv, + &cq_attr.mlx5)); +} + +/** + * Create a WQ Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param priv + * Pointer to device private data. + * @param rxq_data + * Pointer to Rx queue data. + * @param idx + * Queue index in DPDK Rx queue array + * @param wqe_n + * Number of WQEs in WQ. 
+ * @param rxq_obj + * Pointer to Rx queue object data. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +static struct ibv_wq * +mlx5_ibv_wq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv, + struct mlx5_rxq_data *rxq_data, uint16_t idx, + unsigned int wqe_n, struct mlx5_rxq_obj *rxq_obj) +{ + struct { + struct ibv_wq_init_attr ibv; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + struct mlx5dv_wq_init_attr mlx5; +#endif + } wq_attr; + + wq_attr.ibv = (struct ibv_wq_init_attr){ + .wq_context = NULL, /* Could be useful in the future. */ + .wq_type = IBV_WQT_RQ, + /* Max number of outstanding WRs. */ + .max_wr = wqe_n >> rxq_data->sges_n, + /* Max number of scatter/gather elements in a WR. */ + .max_sge = 1 << rxq_data->sges_n, + .pd = priv->sh->pd, + .cq = rxq_obj->cq, + .comp_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING | 0, + .create_flags = (rxq_data->vlan_strip ? + IBV_WQ_FLAGS_CVLAN_STRIPPING : 0), + }; + /* By default, FCS (CRC) is stripped by hardware. */ + if (rxq_data->crc_present) { + wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; + wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + } + if (priv->config.hw_padding) { +#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) + wq_attr.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING; + wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; +#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) + wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_PCI_WRITE_END_PADDING; + wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; +#endif + } +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + wq_attr.mlx5 = (struct mlx5dv_wq_init_attr){ + .comp_mask = 0, + }; + if (mlx5_rxq_mprq_enabled(rxq_data)) { + struct mlx5dv_striding_rq_init_attr *mprq_attr = + &wq_attr.mlx5.striding_rq_attrs; + + wq_attr.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ; + *mprq_attr = (struct mlx5dv_striding_rq_init_attr){ + .single_stride_log_num_of_bytes = rxq_data->strd_sz_n, + .single_wqe_log_num_of_strides = rxq_data->strd_num_n, + .two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT, + }; + } + rxq_obj->wq = mlx5_glue->dv_create_wq(priv->sh->ctx, &wq_attr.ibv, + &wq_attr.mlx5); +#else + rxq_obj->wq = mlx5_glue->create_wq(priv->sh->ctx, &wq_attr.ibv); +#endif + if (rxq_obj->wq) { + /* + * Make sure number of WRs*SGEs match expectations since a queue + * cannot allocate more than "desc" buffers. + */ + if (wq_attr.ibv.max_wr != (wqe_n >> rxq_data->sges_n) || + wq_attr.ibv.max_sge != (1u << rxq_data->sges_n)) { + DRV_LOG(ERR, + "port %u Rx queue %u requested %u*%u but got" + " %u*%u WRs*SGEs", + dev->data->port_id, idx, + wqe_n >> rxq_data->sges_n, + (1 << rxq_data->sges_n), + wq_attr.ibv.max_wr, wq_attr.ibv.max_sge); + claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq)); + rxq_obj->wq = NULL; + rte_errno = EINVAL; + } + } + return rxq_obj->wq; +} + +/** + * Fill common fields of create RQ attributes structure. + * + * @param rxq_data + * Pointer to Rx queue data. + * @param cqn + * CQ number to use with this RQ. + * @param rq_attr + * RQ attributes structure to fill.. + */ +static void +mlx5_devx_create_rq_attr_fill(struct mlx5_rxq_data *rxq_data, uint32_t cqn, + struct mlx5_devx_create_rq_attr *rq_attr) +{ + rq_attr->state = MLX5_RQC_STATE_RST; + rq_attr->vsd = (rxq_data->vlan_strip) ? 0 : 1; + rq_attr->cqn = cqn; + rq_attr->scatter_fcs = (rxq_data->crc_present) ? 1 : 0; +} + +/** + * Fill common fields of DevX WQ attributes structure. + * + * @param priv + * Pointer to device private data. + * @param rxq_ctrl + * Pointer to Rx queue control structure. 
+ * @param wq_attr + * WQ attributes structure to fill.. + */ +static void +mlx5_devx_wq_attr_fill(struct mlx5_priv *priv, struct mlx5_rxq_ctrl *rxq_ctrl, + struct mlx5_devx_wq_attr *wq_attr) +{ + wq_attr->end_padding_mode = priv->config.cqe_pad ? + MLX5_WQ_END_PAD_MODE_ALIGN : + MLX5_WQ_END_PAD_MODE_NONE; + wq_attr->pd = priv->sh->pdn; + wq_attr->dbr_addr = rxq_ctrl->dbr_offset; + wq_attr->dbr_umem_id = rxq_ctrl->dbr_umem_id; + wq_attr->dbr_umem_valid = 1; + wq_attr->wq_umem_id = rxq_ctrl->wq_umem->umem_id; + wq_attr->wq_umem_valid = 1; +} + +/** + * Create a RQ object using DevX. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Rx queue array + * @param cqn + * CQ number to use with this RQ. + * + * @return + * The DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_devx_obj * +mlx5_devx_rq_new(struct rte_eth_dev *dev, uint16_t idx, uint32_t cqn) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + struct mlx5_devx_create_rq_attr rq_attr; + uint32_t wqe_n = 1 << (rxq_data->elts_n - rxq_data->sges_n); + uint32_t wq_size = 0; + uint32_t wqe_size = 0; + uint32_t log_wqe_size = 0; + void *buf = NULL; + struct mlx5_devx_obj *rq; + + memset(&rq_attr, 0, sizeof(rq_attr)); + /* Fill RQ attributes. */ + rq_attr.mem_rq_type = MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE; + rq_attr.flush_in_error_en = 1; + mlx5_devx_create_rq_attr_fill(rxq_data, cqn, &rq_attr); + /* Fill WQ attributes for this RQ. */ + if (mlx5_rxq_mprq_enabled(rxq_data)) { + rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ; + /* + * Number of strides in each WQE: + * 512*2^single_wqe_log_num_of_strides. + */ + rq_attr.wq_attr.single_wqe_log_num_of_strides = + rxq_data->strd_num_n - + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + /* Stride size = (2^single_stride_log_num_of_bytes)*64B. */ + rq_attr.wq_attr.single_stride_log_num_of_bytes = + rxq_data->strd_sz_n - + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES; + wqe_size = sizeof(struct mlx5_wqe_mprq); + } else { + rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC; + wqe_size = sizeof(struct mlx5_wqe_data_seg); + } + log_wqe_size = log2above(wqe_size) + rxq_data->sges_n; + rq_attr.wq_attr.log_wq_stride = log_wqe_size; + rq_attr.wq_attr.log_wq_sz = rxq_data->elts_n - rxq_data->sges_n; + /* Calculate and allocate WQ memory space. */ + wqe_size = 1 << log_wqe_size; /* round up power of two.*/ + wq_size = wqe_n * wqe_size; + buf = rte_calloc_socket(__func__, 1, wq_size, MLX5_WQE_BUF_ALIGNMENT, + rxq_ctrl->socket); + if (!buf) + return NULL; + rxq_data->wqes = buf; + rxq_ctrl->wq_umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, + buf, wq_size, 0); + if (!rxq_ctrl->wq_umem) { + rte_free(buf); + return NULL; + } + mlx5_devx_wq_attr_fill(priv, rxq_ctrl, &rq_attr.wq_attr); + rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &rq_attr, rxq_ctrl->socket); + if (!rq) + rxq_release_rq_resources(rxq_ctrl); + return rq; +} + +/** + * Create the Rx hairpin queue object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Rx queue array + * + * @return + * The hairpin DevX object initialised, NULL otherwise and rte_errno is set. 
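+ *
+ * The hairpin WQ sizing below works on log2 values: an explicitly requested
+ * data size larger than the firmware limit log_max_hairpin_wq_data_sz is
+ * rejected with ERANGE, the default is clamped to that limit, and the
+ * packet count is then derived from the data size, roughly:
+ *
+ *	log_hairpin_num_packets = log_hairpin_data_sz -
+ *				  MLX5_HAIRPIN_QUEUE_STRIDE;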
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	struct mlx5_devx_create_rq_attr attr = { 0 };
+	struct mlx5_rxq_obj *tmpl = NULL;
+	int ret = 0;
+	uint32_t max_wq_data;
+
+	MLX5_ASSERT(rxq_data);
+	MLX5_ASSERT(!rxq_ctrl->obj);
+	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+				 rxq_ctrl->socket);
+	if (!tmpl) {
+		DRV_LOG(ERR,
+			"port %u Rx queue %u cannot allocate verbs resources",
+			dev->data->port_id, rxq_data->idx);
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	tmpl->type = MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN;
+	tmpl->rxq_ctrl = rxq_ctrl;
+	attr.hairpin = 1;
+	max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz;
+	/* Jumbo frames > 9 KB should be supported, and more packets per queue. */
+	if (priv->config.log_hp_size != (uint32_t)MLX5_ARG_UNSET) {
+		if (priv->config.log_hp_size > max_wq_data) {
+			DRV_LOG(ERR, "total data size %u power of 2 is "
+				"too large for hairpin",
+				priv->config.log_hp_size);
+			rte_errno = ERANGE;
+			return NULL;
+		}
+		attr.wq_attr.log_hairpin_data_sz = priv->config.log_hp_size;
+	} else {
+		attr.wq_attr.log_hairpin_data_sz =
+				(max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ?
+				 max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE;
+	}
+	/* Set the packets number to the maximum value for performance. */
+	attr.wq_attr.log_hairpin_num_packets =
+			attr.wq_attr.log_hairpin_data_sz -
+			MLX5_HAIRPIN_QUEUE_STRIDE;
+	tmpl->rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &attr,
+					   rxq_ctrl->socket);
+	if (!tmpl->rq) {
+		DRV_LOG(ERR,
+			"port %u Rx hairpin queue %u cannot create RQ object",
+			dev->data->port_id, idx);
+		rte_errno = errno;
+		goto error;
+	}
+	DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
+		idx, (void *)&tmpl);
+	rte_atomic32_inc(&tmpl->refcnt);
+	LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
+	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+	return tmpl;
+error:
+	ret = rte_errno; /* Save rte_errno before cleanup. */
+	if (tmpl && tmpl->rq)
+		mlx5_devx_cmd_destroy(tmpl->rq);
+	rte_errno = ret; /* Restore rte_errno. */
+	return NULL;
+}
+
+/**
+ * Create the Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array.
+ * @param type
+ *   Type of Rx queue object to create.
+ *
+ * @return
+ *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
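+ *
+ * The completion queue is sized from the work queue: with a Multi-Packet RQ
+ * every stride may complete separately, so cqe_n = wqe_n * strides - 1,
+ * otherwise cqe_n = wqe_n - 1. As a worked example with illustrative
+ * figures, elts_n = 10 gives wqe_n = 1024, and strd_num_n = 6 (64 strides
+ * per WQE) yields a CQ of 1024 * 64 - 1 = 65535 entries.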
+ */ +struct mlx5_rxq_obj * +mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_rxq_obj_type type) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + struct ibv_wq_attr mod; + unsigned int cqe_n; + unsigned int wqe_n = 1 << rxq_data->elts_n; + struct mlx5_rxq_obj *tmpl = NULL; + struct mlx5dv_cq cq_info; + struct mlx5dv_rwq rwq; + int ret = 0; + struct mlx5dv_obj obj; + + MLX5_ASSERT(rxq_data); + MLX5_ASSERT(!rxq_ctrl->obj); + if (type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN) + return mlx5_rxq_obj_hairpin_new(dev, idx); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE; + priv->verbs_alloc_ctx.obj = rxq_ctrl; + tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0, + rxq_ctrl->socket); + if (!tmpl) { + DRV_LOG(ERR, + "port %u Rx queue %u cannot allocate verbs resources", + dev->data->port_id, rxq_data->idx); + rte_errno = ENOMEM; + goto error; + } + tmpl->type = type; + tmpl->rxq_ctrl = rxq_ctrl; + if (rxq_ctrl->irq) { + tmpl->channel = mlx5_glue->create_comp_channel(priv->sh->ctx); + if (!tmpl->channel) { + DRV_LOG(ERR, "port %u: comp channel creation failure", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + } + if (mlx5_rxq_mprq_enabled(rxq_data)) + cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1; + else + cqe_n = wqe_n - 1; + tmpl->cq = mlx5_ibv_cq_new(dev, priv, rxq_data, cqe_n, tmpl); + if (!tmpl->cq) { + DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + obj.cq.in = tmpl->cq; + obj.cq.out = &cq_info; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ); + if (ret) { + rte_errno = ret; + goto error; + } + if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; + goto error; + } + DRV_LOG(DEBUG, "port %u device_attr.max_qp_wr is %d", + dev->data->port_id, priv->sh->device_attr.orig_attr.max_qp_wr); + DRV_LOG(DEBUG, "port %u device_attr.max_sge is %d", + dev->data->port_id, priv->sh->device_attr.orig_attr.max_sge); + /* Allocate door-bell for types created with DevX. */ + if (tmpl->type != MLX5_RXQ_OBJ_TYPE_IBV) { + struct mlx5_devx_dbr_page *dbr_page; + int64_t dbr_offset; + + dbr_offset = mlx5_get_dbr(dev, &dbr_page); + if (dbr_offset < 0) + goto error; + rxq_ctrl->dbr_offset = dbr_offset; + rxq_ctrl->dbr_umem_id = dbr_page->umem->umem_id; + rxq_ctrl->dbr_umem_id_valid = 1; + rxq_data->rq_db = (uint32_t *)((uintptr_t)dbr_page->dbrs + + (uintptr_t)rxq_ctrl->dbr_offset); + } + if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV) { + tmpl->wq = mlx5_ibv_wq_new(dev, priv, rxq_data, idx, wqe_n, + tmpl); + if (!tmpl->wq) { + DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + /* Change queue state to ready. 
*/ + mod = (struct ibv_wq_attr){ + .attr_mask = IBV_WQ_ATTR_STATE, + .wq_state = IBV_WQS_RDY, + }; + ret = mlx5_glue->modify_wq(tmpl->wq, &mod); + if (ret) { + DRV_LOG(ERR, + "port %u Rx queue %u WQ state to IBV_WQS_RDY" + " failed", dev->data->port_id, idx); + rte_errno = ret; + goto error; + } + obj.rwq.in = tmpl->wq; + obj.rwq.out = &rwq; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_RWQ); + if (ret) { + rte_errno = ret; + goto error; + } + rxq_data->wqes = rwq.buf; + rxq_data->rq_db = rwq.dbrec; + } else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) { + struct mlx5_devx_modify_rq_attr rq_attr; + + memset(&rq_attr, 0, sizeof(rq_attr)); + tmpl->rq = mlx5_devx_rq_new(dev, idx, cq_info.cqn); + if (!tmpl->rq) { + DRV_LOG(ERR, "port %u Rx queue %u RQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + /* Change queue state to ready. */ + rq_attr.rq_state = MLX5_RQC_STATE_RST; + rq_attr.state = MLX5_RQC_STATE_RDY; + ret = mlx5_devx_cmd_modify_rq(tmpl->rq, &rq_attr); + if (ret) + goto error; + } + /* Fill the rings. */ + rxq_data->cqe_n = log2above(cq_info.cqe_cnt); + rxq_data->cq_db = cq_info.dbrec; + rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf; + rxq_data->cq_uar = cq_info.cq_uar; + rxq_data->cqn = cq_info.cqn; + rxq_data->cq_arm_sn = 0; + mlx5_rxq_initialize(rxq_data); + rxq_data->cq_ci = 0; + DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id, + idx, (void *)&tmpl); + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + return tmpl; +error: + if (tmpl) { + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV && tmpl->wq) + claim_zero(mlx5_glue->destroy_wq(tmpl->wq)); + else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ && tmpl->rq) + claim_zero(mlx5_devx_cmd_destroy(tmpl->rq)); + if (tmpl->cq) + claim_zero(mlx5_glue->destroy_cq(tmpl->cq)); + if (tmpl->channel) + claim_zero(mlx5_glue->destroy_comp_channel + (tmpl->channel)); + rte_free(tmpl); + rte_errno = ret; /* Restore rte_errno. */ + } + if (type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) + rxq_release_rq_resources(rxq_ctrl); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + return NULL; +} + +/** + * Verify the Rx queue objects list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of objects not released. + */ +int +mlx5_rxq_obj_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret = 0; + struct mlx5_rxq_obj *rxq_obj; + + LIST_FOREACH(rxq_obj, &priv->rxqsobj, next) { + DRV_LOG(DEBUG, "port %u Rx queue %u still referenced", + dev->data->port_id, rxq_obj->rxq_ctrl->rxq.idx); + ++ret; + } + return ret; +} + +/** + * Callback function to initialize mbufs for Multi-Packet RQ. + */ +static inline void +mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg, + void *_m, unsigned int i __rte_unused) +{ + struct mlx5_mprq_buf *buf = _m; + struct rte_mbuf_ext_shared_info *shinfo; + unsigned int strd_n = (unsigned int)(uintptr_t)opaque_arg; + unsigned int j; + + memset(_m, 0, sizeof(*buf)); + buf->mp = mp; + rte_atomic16_set(&buf->refcnt, 1); + for (j = 0; j != strd_n; ++j) { + shinfo = &buf->shinfos[j]; + shinfo->free_cb = mlx5_mprq_buf_free_cb; + shinfo->fcb_opaque = buf; + } +} + +/** + * Free mempool of Multi-Packet RQ. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. 
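+ *
+ * Note: if any buffer from the pool is still attached to an mbuf held by
+ * the application, the pool is not full and this call fails with
+ * rte_errno set to EBUSY instead of freeing the mempool.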
+ */ +int +mlx5_mprq_free_mp(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + unsigned int i; + + if (mp == NULL) + return 0; + DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ", + dev->data->port_id, mp->name); + /* + * If a buffer in the pool has been externally attached to a mbuf and it + * is still in use by application, destroying the Rx queue can spoil + * the packet. It is unlikely to happen but if application dynamically + * creates and destroys with holding Rx packets, this can happen. + * + * TODO: It is unavoidable for now because the mempool for Multi-Packet + * RQ isn't provided by application but managed by PMD. + */ + if (!rte_mempool_full(mp)) { + DRV_LOG(ERR, + "port %u mempool for Multi-Packet RQ is still in use", + dev->data->port_id); + rte_errno = EBUSY; + return -rte_errno; + } + rte_mempool_free(mp); + /* Unset mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + rxq->mprq_mp = NULL; + } + priv->mprq_mp = NULL; + return 0; +} + +/** + * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the + * mempool. If already allocated, reuse it if there're enough elements. + * Otherwise, resize it. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. + */ +int +mlx5_mprq_alloc_mp(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + char name[RTE_MEMPOOL_NAMESIZE]; + unsigned int desc = 0; + unsigned int buf_len; + unsigned int obj_num; + unsigned int obj_size; + unsigned int strd_num_n = 0; + unsigned int strd_sz_n = 0; + unsigned int i; + unsigned int n_ibv = 0; + + if (!mlx5_mprq_enabled(dev)) + return 0; + /* Count the total number of descriptors configured. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = container_of + (rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD) + continue; + n_ibv++; + desc += 1 << rxq->elts_n; + /* Get the max number of strides. */ + if (strd_num_n < rxq->strd_num_n) + strd_num_n = rxq->strd_num_n; + /* Get the max size of a stride. */ + if (strd_sz_n < rxq->strd_sz_n) + strd_sz_n = rxq->strd_sz_n; + } + MLX5_ASSERT(strd_num_n && strd_sz_n); + buf_len = (1 << strd_num_n) * (1 << strd_sz_n); + obj_size = sizeof(struct mlx5_mprq_buf) + buf_len + (1 << strd_num_n) * + sizeof(struct rte_mbuf_ext_shared_info) + RTE_PKTMBUF_HEADROOM; + /* + * Received packets can be either memcpy'd or externally referenced. In + * case that the packet is attached to an mbuf as an external buffer, as + * it isn't possible to predict how the buffers will be queued by + * application, there's no option to exactly pre-allocate needed buffers + * in advance but to speculatively prepares enough buffers. + * + * In the data path, if this Mempool is depleted, PMD will try to memcpy + * received packets to buffers provided by application (rxq->mp) until + * this Mempool gets available again. + */ + desc *= 4; + obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * n_ibv; + /* + * rte_mempool_create_empty() has sanity check to refuse large cache + * size compared to the number of elements. + * CACHE_FLUSHTHRESH_MULTIPLIER is defined in a C file, so using a + * constant number 2 instead. 
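+ *
+ * Worked example (illustrative numbers only): with 4 standard Rx queues
+ * of 512 descriptors each, strd_num_n = 6 and strd_sz_n = 11:
+ *   buf_len  = 64 * 2048 = 128KB,
+ *   obj_size = sizeof(struct mlx5_mprq_buf) + 128KB +
+ *              64 * sizeof(struct rte_mbuf_ext_shared_info) +
+ *              RTE_PKTMBUF_HEADROOM,
+ *   desc     = 4 * 512 = 2048, multiplied by 4 above => 8192,
+ *   obj_num  = 8192 + MLX5_MPRQ_MP_CACHE_SZ * 4, floored below at
+ *              MLX5_MPRQ_MP_CACHE_SZ * 2.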
+ */ + obj_num = RTE_MAX(obj_num, MLX5_MPRQ_MP_CACHE_SZ * 2); + /* Check a mempool is already allocated and if it can be resued. */ + if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) { + DRV_LOG(DEBUG, "port %u mempool %s is being reused", + dev->data->port_id, mp->name); + /* Reuse. */ + goto exit; + } else if (mp != NULL) { + DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it", + dev->data->port_id, mp->name); + /* + * If failed to free, which means it may be still in use, no way + * but to keep using the existing one. On buffer underrun, + * packets will be memcpy'd instead of external buffer + * attachment. + */ + if (mlx5_mprq_free_mp(dev)) { + if (mp->elt_size >= obj_size) + goto exit; + else + return -rte_errno; + } + } + snprintf(name, sizeof(name), "port-%u-mprq", dev->data->port_id); + mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ, + 0, NULL, NULL, mlx5_mprq_buf_init, + (void *)(uintptr_t)(1 << strd_num_n), + dev->device->numa_node, 0); + if (mp == NULL) { + DRV_LOG(ERR, + "port %u failed to allocate a mempool for" + " Multi-Packet RQ, count=%u, size=%u", + dev->data->port_id, obj_num, obj_size); + rte_errno = ENOMEM; + return -rte_errno; + } + priv->mprq_mp = mp; +exit: + /* Set mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = container_of + (rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD) + continue; + rxq->mprq_mp = mp; + } + DRV_LOG(INFO, "port %u Multi-Packet RQ is configured", + dev->data->port_id); + return 0; +} + +#define MLX5_MAX_TCP_HDR_OFFSET ((unsigned int)(sizeof(struct rte_ether_hdr) + \ + sizeof(struct rte_vlan_hdr) * 2 + \ + sizeof(struct rte_ipv6_hdr))) +#define MAX_TCP_OPTION_SIZE 40u +#define MLX5_MAX_LRO_HEADER_FIX ((unsigned int)(MLX5_MAX_TCP_HDR_OFFSET + \ + sizeof(struct rte_tcp_hdr) + \ + MAX_TCP_OPTION_SIZE)) + +/** + * Adjust the maximum LRO massage size. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * @param max_lro_size + * The maximum size for LRO packet. + */ +static void +mlx5_max_lro_msg_size_adjust(struct rte_eth_dev *dev, uint16_t idx, + uint32_t max_lro_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->config.hca_attr.lro_max_msg_sz_mode == + MLX5_LRO_MAX_MSG_SIZE_START_FROM_L4 && max_lro_size > + MLX5_MAX_TCP_HDR_OFFSET) + max_lro_size -= MLX5_MAX_TCP_HDR_OFFSET; + max_lro_size = RTE_MIN(max_lro_size, MLX5_MAX_LRO_SIZE); + MLX5_ASSERT(max_lro_size >= MLX5_LRO_SEG_CHUNK_SIZE); + max_lro_size /= MLX5_LRO_SEG_CHUNK_SIZE; + if (priv->max_lro_msg_size) + priv->max_lro_msg_size = + RTE_MIN((uint32_t)priv->max_lro_msg_size, max_lro_size); + else + priv->max_lro_msg_size = max_lro_size; + DRV_LOG(DEBUG, + "port %u Rx Queue %u max LRO message size adjusted to %u bytes", + dev->data->port_id, idx, + priv->max_lro_msg_size * MLX5_LRO_SEG_CHUNK_SIZE); +} + +/** + * Create a DPDK Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. 
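+ *
+ * MPRQ sizing sketch (illustrative values, not a recommendation): with
+ * desc = 512, mprq_stride_nums = 6 and a 2KB stride, the queue becomes a
+ * Multi-Packet RQ when MPRQ is enabled, desc > 2^6, and the maximum
+ * packet plus headroom fits a stride (or a user-specified stride
+ * configuration provides enough room); desc is then trimmed to
+ * 512 >> 6 = 8 multi-packet WQEs, each able to hold up to 64 packets.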
+ */ +struct mlx5_rxq_ctrl * +mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *tmpl; + unsigned int mb_len = rte_pktmbuf_data_room_size(mp); + unsigned int mprq_stride_nums; + unsigned int mprq_stride_size; + unsigned int mprq_stride_cap; + struct mlx5_dev_config *config = &priv->config; + /* + * Always allocate extra slots, even if eventually + * the vector Rx will not be used. + */ + uint16_t desc_n = + desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP; + uint64_t offloads = conf->offloads | + dev->data->dev_conf.rxmode.offloads; + unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO); + const int mprq_en = mlx5_check_mprq_support(dev) > 0; + unsigned int max_rx_pkt_len = lro_on_queue ? + dev->data->dev_conf.rxmode.max_lro_pkt_size : + dev->data->dev_conf.rxmode.max_rx_pkt_len; + unsigned int non_scatter_min_mbuf_size = max_rx_pkt_len + + RTE_PKTMBUF_HEADROOM; + unsigned int max_lro_size = 0; + unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM; + + if (non_scatter_min_mbuf_size > mb_len && !(offloads & + DEV_RX_OFFLOAD_SCATTER)) { + DRV_LOG(ERR, "port %u Rx queue %u: Scatter offload is not" + " configured and no enough mbuf space(%u) to contain " + "the maximum RX packet length(%u) with head-room(%u)", + dev->data->port_id, idx, mb_len, max_rx_pkt_len, + RTE_PKTMBUF_HEADROOM); + rte_errno = ENOSPC; + return NULL; + } + tmpl = rte_calloc_socket("RXQ", 1, + sizeof(*tmpl) + + desc_n * sizeof(struct rte_mbuf *), + 0, socket); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + tmpl->type = MLX5_RXQ_TYPE_STANDARD; + if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + tmpl->socket = socket; + if (dev->data->dev_conf.intr_conf.rxq) + tmpl->irq = 1; + mprq_stride_nums = config->mprq.stride_num_n ? + config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N; + mprq_stride_size = non_scatter_min_mbuf_size <= + (1U << config->mprq.max_stride_size_n) ? + log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N; + mprq_stride_cap = (config->mprq.stride_num_n ? + (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) * + (config->mprq.stride_size_n ? + (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size)); + /* + * This Rx queue can be configured as a Multi-Packet RQ if all of the + * following conditions are met: + * - MPRQ is enabled. + * - The number of descs is more than the number of strides. + * - max_rx_pkt_len plus overhead is less than the max size + * of a stride or mprq_stride_size is specified by a user. + * Need to nake sure that there are enough stides to encap + * the maximum packet size in case mprq_stride_size is set. + * Otherwise, enable Rx scatter if necessary. + */ + if (mprq_en && desc > (1U << mprq_stride_nums) && + (non_scatter_min_mbuf_size <= + (1U << config->mprq.max_stride_size_n) || + (config->mprq.stride_size_n && + non_scatter_min_mbuf_size <= mprq_stride_cap))) { + /* TODO: Rx scatter isn't supported yet. */ + tmpl->rxq.sges_n = 0; + /* Trim the number of descs needed. */ + desc >>= mprq_stride_nums; + tmpl->rxq.strd_num_n = config->mprq.stride_num_n ? + config->mprq.stride_num_n : mprq_stride_nums; + tmpl->rxq.strd_sz_n = config->mprq.stride_size_n ? 
+ config->mprq.stride_size_n : mprq_stride_size; + tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT; + tmpl->rxq.strd_scatter_en = + !!(offloads & DEV_RX_OFFLOAD_SCATTER); + tmpl->rxq.mprq_max_memcpy_len = RTE_MIN(first_mb_free_size, + config->mprq.max_memcpy_len); + max_lro_size = RTE_MIN(max_rx_pkt_len, + (1u << tmpl->rxq.strd_num_n) * + (1u << tmpl->rxq.strd_sz_n)); + DRV_LOG(DEBUG, + "port %u Rx queue %u: Multi-Packet RQ is enabled" + " strd_num_n = %u, strd_sz_n = %u", + dev->data->port_id, idx, + tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n); + } else if (max_rx_pkt_len <= first_mb_free_size) { + tmpl->rxq.sges_n = 0; + max_lro_size = max_rx_pkt_len; + } else if (offloads & DEV_RX_OFFLOAD_SCATTER) { + unsigned int size = non_scatter_min_mbuf_size; + unsigned int sges_n; + + if (lro_on_queue && first_mb_free_size < + MLX5_MAX_LRO_HEADER_FIX) { + DRV_LOG(ERR, "Not enough space in the first segment(%u)" + " to include the max header size(%u) for LRO", + first_mb_free_size, MLX5_MAX_LRO_HEADER_FIX); + rte_errno = ENOTSUP; + goto error; + } + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = log2above((size / mb_len) + !!(size % mb_len)); + if (sges_n > MLX5_MAX_LOG_RQ_SEGS) { + DRV_LOG(ERR, + "port %u too many SGEs (%u) needed to handle" + " requested maximum packet size %u, the maximum" + " supported are %u", dev->data->port_id, + 1 << sges_n, max_rx_pkt_len, + 1u << MLX5_MAX_LOG_RQ_SEGS); + rte_errno = ENOTSUP; + goto error; + } + tmpl->rxq.sges_n = sges_n; + max_lro_size = max_rx_pkt_len; + } + if (config->mprq.enabled && !mlx5_rxq_mprq_enabled(&tmpl->rxq)) + DRV_LOG(WARNING, + "port %u MPRQ is requested but cannot be enabled\n" + " (requested: pkt_sz = %u, desc_num = %u," + " rxq_num = %u, stride_sz = %u, stride_num = %u\n" + " supported: min_rxqs_num = %u," + " min_stride_sz = %u, max_stride_sz = %u).", + dev->data->port_id, non_scatter_min_mbuf_size, + desc, priv->rxqs_n, + config->mprq.stride_size_n ? + (1U << config->mprq.stride_size_n) : + (1U << mprq_stride_size), + config->mprq.stride_num_n ? + (1U << config->mprq.stride_num_n) : + (1U << mprq_stride_nums), + config->mprq.min_rxqs_num, + (1U << config->mprq.min_stride_size_n), + (1U << config->mprq.max_stride_size_n)); + DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u", + dev->data->port_id, 1 << tmpl->rxq.sges_n); + if (desc % (1 << tmpl->rxq.sges_n)) { + DRV_LOG(ERR, + "port %u number of Rx queue descriptors (%u) is not a" + " multiple of SGEs per packet (%u)", + dev->data->port_id, + desc, + 1 << tmpl->rxq.sges_n); + rte_errno = EINVAL; + goto error; + } + mlx5_max_lro_msg_size_adjust(dev, idx, max_lro_size); + /* Toggle RX checksum offload if hardware supports it. */ + tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM); + tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP); + /* Configure VLAN stripping. */ + tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP); + /* By default, FCS (CRC) is stripped by hardware. */ + tmpl->rxq.crc_present = 0; + tmpl->rxq.lro = lro_on_queue; + if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) { + if (config->hw_fcs_strip) { + /* + * RQs used for LRO-enabled TIRs should not be + * configured to scatter the FCS. 
+ */ + if (lro_on_queue) + DRV_LOG(WARNING, + "port %u CRC stripping has been " + "disabled but will still be performed " + "by hardware, because LRO is enabled", + dev->data->port_id); + else + tmpl->rxq.crc_present = 1; + } else { + DRV_LOG(WARNING, + "port %u CRC stripping has been disabled but will" + " still be performed by hardware, make sure MLNX_OFED" + " and firmware are up to date", + dev->data->port_id); + } + } + DRV_LOG(DEBUG, + "port %u CRC stripping is %s, %u bytes will be subtracted from" + " incoming frames to hide it", + dev->data->port_id, + tmpl->rxq.crc_present ? "disabled" : "enabled", + tmpl->rxq.crc_present << 2); + /* Save port ID. */ + tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf && + (!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS)); + tmpl->rxq.port_id = dev->data->port_id; + tmpl->priv = priv; + tmpl->rxq.mp = mp; + tmpl->rxq.elts_n = log2above(desc); + tmpl->rxq.rq_repl_thresh = + MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n); + tmpl->rxq.elts = + (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1); +#ifndef RTE_ARCH_64 + tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq; +#endif + tmpl->rxq.idx = idx; + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next); + return tmpl; +error: + rte_free(tmpl); + return NULL; +} + +/** + * Create a DPDK Rx hairpin queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param hairpin_conf + * The hairpin binding configuration. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_rxq_ctrl * +mlx5_rxq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *tmpl; + + tmpl = rte_calloc_socket("RXQ", 1, sizeof(*tmpl), 0, SOCKET_ID_ANY); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + tmpl->type = MLX5_RXQ_TYPE_HAIRPIN; + tmpl->socket = SOCKET_ID_ANY; + tmpl->rxq.rss_hash = 0; + tmpl->rxq.port_id = dev->data->port_id; + tmpl->priv = priv; + tmpl->rxq.mp = NULL; + tmpl->rxq.elts_n = log2above(desc); + tmpl->rxq.elts = NULL; + tmpl->rxq.mr_ctrl.cache_bh = (struct mlx5_mr_btree) { 0 }; + tmpl->hairpin_conf = *hairpin_conf; + tmpl->rxq.idx = idx; + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next); + return tmpl; +} + +/** + * Get a Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * + * @return + * A pointer to the queue if it exists, NULL otherwise. + */ +struct mlx5_rxq_ctrl * +mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl = NULL; + + if ((*priv->rxqs)[idx]) { + rxq_ctrl = container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, + rxq); + mlx5_rxq_obj_get(dev, idx); + rte_atomic32_inc(&rxq_ctrl->refcnt); + } + return rxq_ctrl; +} + +/** + * Release a Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
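+ *
+ * Usage note: every mlx5_rxq_get(dev, idx) is expected to be balanced by
+ * one mlx5_rxq_release(dev, idx); the control structure, its door-bell
+ * record and the MR b-tree cache are only freed once the last reference
+ * drops.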
+ */ +int +mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl; + + if (!(*priv->rxqs)[idx]) + return 0; + rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); + MLX5_ASSERT(rxq_ctrl->priv); + if (rxq_ctrl->obj && !mlx5_rxq_obj_release(rxq_ctrl->obj)) + rxq_ctrl->obj = NULL; + if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) { + if (rxq_ctrl->dbr_umem_id_valid) + claim_zero(mlx5_release_dbr(dev, rxq_ctrl->dbr_umem_id, + rxq_ctrl->dbr_offset)); + if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) + mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh); + LIST_REMOVE(rxq_ctrl, next); + rte_free(rxq_ctrl); + (*priv->rxqs)[idx] = NULL; + return 0; + } + return 1; +} + +/** + * Verify the Rx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_rxq_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl; + int ret = 0; + + LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) { + DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced", + dev->data->port_id, rxq_ctrl->rxq.idx); + ++ret; + } + return ret; +} + +/** + * Get a Rx queue type. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Rx queue index. + * + * @return + * The Rx queue type. + */ +enum mlx5_rxq_type +mlx5_rxq_get_type(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl = NULL; + + if (idx < priv->rxqs_n && (*priv->rxqs)[idx]) { + rxq_ctrl = container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, + rxq); + return rxq_ctrl->type; + } + return MLX5_RXQ_TYPE_UNDEFINED; +} + +/** + * Create an indirection table. + * + * @param dev + * Pointer to Ethernet device. + * @param queues + * Queues entering in the indirection table. + * @param queues_n + * Number of queues in the array. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_ind_table_obj * +mlx5_ind_table_obj_new(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n, enum mlx5_ind_tbl_type type) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + unsigned int i = 0, j = 0, k = 0; + + ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) + + queues_n * sizeof(uint16_t), 0); + if (!ind_tbl) { + rte_errno = ENOMEM; + return NULL; + } + ind_tbl->type = type; + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) { + const unsigned int wq_n = rte_is_power_of_2(queues_n) ? + log2above(queues_n) : + log2above(priv->config.ind_table_max_size); + struct ibv_wq *wq[1 << wq_n]; + + for (i = 0; i != queues_n; ++i) { + struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, + queues[i]); + if (!rxq) + goto error; + wq[i] = rxq->obj->wq; + ind_tbl->queues[i] = queues[i]; + } + ind_tbl->queues_n = queues_n; + /* Finalise indirection table. */ + k = i; /* Retain value of i for use in error case. 
*/ + for (j = 0; k != (unsigned int)(1 << wq_n); ++k, ++j) + wq[k] = wq[j]; + ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table + (priv->sh->ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = wq_n, + .ind_tbl = wq, + .comp_mask = 0, + }); + if (!ind_tbl->ind_table) { + rte_errno = errno; + goto error; + } + } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */ + struct mlx5_devx_rqt_attr *rqt_attr = NULL; + const unsigned int rqt_n = + 1 << (rte_is_power_of_2(queues_n) ? + log2above(queues_n) : + log2above(priv->config.ind_table_max_size)); + + rqt_attr = rte_calloc(__func__, 1, sizeof(*rqt_attr) + + rqt_n * sizeof(uint32_t), 0); + if (!rqt_attr) { + DRV_LOG(ERR, "port %u cannot allocate RQT resources", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + rqt_attr->rqt_max_size = priv->config.ind_table_max_size; + rqt_attr->rqt_actual_size = rqt_n; + for (i = 0; i != queues_n; ++i) { + struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, + queues[i]); + if (!rxq) + goto error; + rqt_attr->rq_list[i] = rxq->obj->rq->id; + ind_tbl->queues[i] = queues[i]; + } + k = i; /* Retain value of i for use in error case. */ + for (j = 0; k != rqt_n; ++k, ++j) + rqt_attr->rq_list[k] = rqt_attr->rq_list[j]; + ind_tbl->rqt = mlx5_devx_cmd_create_rqt(priv->sh->ctx, + rqt_attr); + rte_free(rqt_attr); + if (!ind_tbl->rqt) { + DRV_LOG(ERR, "port %u cannot create DevX RQT", + dev->data->port_id); + rte_errno = errno; + goto error; + } + ind_tbl->queues_n = queues_n; + } + rte_atomic32_inc(&ind_tbl->refcnt); + LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next); + return ind_tbl; +error: + for (j = 0; j < i; j++) + mlx5_rxq_release(dev, ind_tbl->queues[j]); + rte_free(ind_tbl); + DEBUG("port %u cannot create indirection table", dev->data->port_id); + return NULL; +} + +/** + * Get an indirection table. + * + * @param dev + * Pointer to Ethernet device. + * @param queues + * Queues entering in the indirection table. + * @param queues_n + * Number of queues in the array. + * + * @return + * An indirection table if found. + */ +static struct mlx5_ind_table_obj * +mlx5_ind_table_obj_get(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + + LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { + if ((ind_tbl->queues_n == queues_n) && + (memcmp(ind_tbl->queues, queues, + ind_tbl->queues_n * sizeof(ind_tbl->queues[0])) + == 0)) + break; + } + if (ind_tbl) { + unsigned int i; + + rte_atomic32_inc(&ind_tbl->refcnt); + for (i = 0; i != ind_tbl->queues_n; ++i) + mlx5_rxq_get(dev, ind_tbl->queues[i]); + } + return ind_tbl; +} + +/** + * Release an indirection table. + * + * @param dev + * Pointer to Ethernet device. + * @param ind_table + * Indirection table to release. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
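+ *
+ * Note: the table being released was built by mlx5_ind_table_obj_new()
+ * above with a power-of-two entry count: queues_n entries when queues_n
+ * is already a power of two, otherwise ind_table_max_size entries with
+ * the queue list repeated to fill it (e.g. queues {0, 1, 2} become
+ * {0, 1, 2, 0, 1, 2, ...}). Releasing therefore drops one reference per
+ * listed queue (queues_n of them), not one per table slot.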
+ */ +static int +mlx5_ind_table_obj_release(struct rte_eth_dev *dev, + struct mlx5_ind_table_obj *ind_tbl) +{ + unsigned int i; + + if (rte_atomic32_dec_and_test(&ind_tbl->refcnt)) { + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) + claim_zero(mlx5_glue->destroy_rwq_ind_table + (ind_tbl->ind_table)); + else if (ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX) + claim_zero(mlx5_devx_cmd_destroy(ind_tbl->rqt)); + } + for (i = 0; i != ind_tbl->queues_n; ++i) + claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i])); + if (!rte_atomic32_read(&ind_tbl->refcnt)) { + LIST_REMOVE(ind_tbl, next); + rte_free(ind_tbl); + return 0; + } + return 1; +} + +/** + * Verify the Rx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_ind_table_obj_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + int ret = 0; + + LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { + DRV_LOG(DEBUG, + "port %u indirection table obj %p still referenced", + dev->data->port_id, (void *)ind_tbl); + ++ret; + } + return ret; +} + +/** + * Create an Rx Hash queue. + * + * @param dev + * Pointer to Ethernet device. + * @param rss_key + * RSS key for the Rx hash queue. + * @param rss_key_len + * RSS key length. + * @param hash_fields + * Verbs protocol hash field to make the RSS on. + * @param queues + * Queues entering in hash queue. In case of empty hash_fields only the + * first queue index will be taken for the indirection table. + * @param queues_n + * Number of queues. + * @param tunnel + * Tunnel type. + * + * @return + * The Verbs/DevX object initialised index, 0 otherwise and rte_errno is set. + */ +uint32_t +mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + int tunnel __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + uint32_t hrxq_idx = 0; + struct ibv_qp *qp = NULL; + struct mlx5_ind_table_obj *ind_tbl; + int err; + struct mlx5_devx_obj *tir = NULL; + struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[queues[0]]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + + queues_n = hash_fields ? queues_n : 1; + ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n); + if (!ind_tbl) { + enum mlx5_ind_tbl_type type; + + type = rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV ? + MLX5_IND_TBL_TYPE_IBV : MLX5_IND_TBL_TYPE_DEVX; + ind_tbl = mlx5_ind_table_obj_new(dev, queues, queues_n, type); + } + if (!ind_tbl) { + rte_errno = ENOMEM; + return 0; + } + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) { +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + struct mlx5dv_qp_init_attr qp_init_attr; + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + if (tunnel) { + qp_init_attr.comp_mask = + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; + qp_init_attr.create_flags = + MLX5DV_QP_CREATE_TUNNEL_OFFLOADS; + } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (dev->data->dev_conf.lpbk_mode) { + /* + * Allow packet sent from NIC loop back + * w/o source MAC check. 
+ */ + qp_init_attr.comp_mask |= + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; + qp_init_attr.create_flags |= + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC; + } +#endif + qp = mlx5_glue->dv_create_qp + (priv->sh->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = + IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = rss_key_len, + .rx_hash_key = + (void *)(uintptr_t)rss_key, + .rx_hash_fields_mask = hash_fields, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->sh->pd, + }, + &qp_init_attr); +#else + qp = mlx5_glue->create_qp_ex + (priv->sh->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = + IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = rss_key_len, + .rx_hash_key = + (void *)(uintptr_t)rss_key, + .rx_hash_fields_mask = hash_fields, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->sh->pd, + }); +#endif + if (!qp) { + rte_errno = errno; + goto error; + } + } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */ + struct mlx5_devx_tir_attr tir_attr; + uint32_t i; + uint32_t lro = 1; + + /* Enable TIR LRO only if all the queues were configured for. */ + for (i = 0; i < queues_n; ++i) { + if (!(*priv->rxqs)[queues[i]]->lro) { + lro = 0; + break; + } + } + memset(&tir_attr, 0, sizeof(tir_attr)); + tir_attr.disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT; + tir_attr.rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ; + tir_attr.tunneled_offload_en = !!tunnel; + /* If needed, translate hash_fields bitmap to PRM format. */ + if (hash_fields) { +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + struct mlx5_rx_hash_field_select *rx_hash_field_select = + hash_fields & IBV_RX_HASH_INNER ? + &tir_attr.rx_hash_field_selector_inner : + &tir_attr.rx_hash_field_selector_outer; +#else + struct mlx5_rx_hash_field_select *rx_hash_field_select = + &tir_attr.rx_hash_field_selector_outer; +#endif + + /* 1 bit: 0: IPv4, 1: IPv6. */ + rx_hash_field_select->l3_prot_type = + !!(hash_fields & MLX5_IPV6_IBV_RX_HASH); + /* 1 bit: 0: TCP, 1: UDP. */ + rx_hash_field_select->l4_prot_type = + !!(hash_fields & MLX5_UDP_IBV_RX_HASH); + /* Bitmask which sets which fields to use in RX Hash. 
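+ * For example (illustrative), a TCP/IPv4 RSS request hashing on both
+ * addresses and both ports sets all four SELECTED_FIELDS bits below,
+ * while an L3-only hash sets just the SRC_IP and DST_IP bits.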
*/ + rx_hash_field_select->selected_fields = + ((!!(hash_fields & MLX5_L3_SRC_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP) | + (!!(hash_fields & MLX5_L3_DST_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP | + (!!(hash_fields & MLX5_L4_SRC_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT | + (!!(hash_fields & MLX5_L4_DST_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT; + } + if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN) + tir_attr.transport_domain = priv->sh->td->id; + else + tir_attr.transport_domain = priv->sh->tdn; + memcpy(tir_attr.rx_hash_toeplitz_key, rss_key, + MLX5_RSS_HASH_KEY_LEN); + tir_attr.indirect_table = ind_tbl->rqt->id; + if (dev->data->dev_conf.lpbk_mode) + tir_attr.self_lb_block = + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + if (lro) { + tir_attr.lro_timeout_period_usecs = + priv->config.lro.timeout; + tir_attr.lro_max_msg_sz = priv->max_lro_msg_size; + tir_attr.lro_enable_mask = + MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | + MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO; + } + tir = mlx5_devx_cmd_create_tir(priv->sh->ctx, &tir_attr); + if (!tir) { + DRV_LOG(ERR, "port %u cannot create DevX TIR", + dev->data->port_id); + rte_errno = errno; + goto error; + } + } + hrxq = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_HRXQ], &hrxq_idx); + if (!hrxq) + goto error; + hrxq->ind_table = ind_tbl; + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) { + hrxq->qp = qp; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + hrxq->action = + mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp); + if (!hrxq->action) { + rte_errno = errno; + goto error; + } +#endif + } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */ + hrxq->tir = tir; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + hrxq->action = mlx5_glue->dv_create_flow_action_dest_devx_tir + (hrxq->tir->obj); + if (!hrxq->action) { + rte_errno = errno; + goto error; + } +#endif + } + hrxq->rss_key_len = rss_key_len; + hrxq->hash_fields = hash_fields; + memcpy(hrxq->rss_key, rss_key, rss_key_len); + rte_atomic32_inc(&hrxq->refcnt); + ILIST_INSERT(priv->sh->ipool[MLX5_IPOOL_HRXQ], &priv->hrxqs, hrxq_idx, + hrxq, next); + return hrxq_idx; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_ind_table_obj_release(dev, ind_tbl); + if (qp) + claim_zero(mlx5_glue->destroy_qp(qp)); + else if (tir) + claim_zero(mlx5_devx_cmd_destroy(tir)); + rte_errno = err; /* Restore rte_errno. */ + return 0; +} + +/** + * Get an Rx Hash queue. + * + * @param dev + * Pointer to Ethernet device. + * @param rss_conf + * RSS configuration for the Rx hash queue. + * @param queues + * Queues entering in hash queue. In case of empty hash_fields only the + * first queue index will be taken for the indirection table. + * @param queues_n + * Number of queues. + * + * @return + * An hash Rx queue index on success. + */ +uint32_t +mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + uint32_t idx; + + queues_n = hash_fields ? 
queues_n : 1; + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_HRXQ], priv->hrxqs, idx, + hrxq, next) { + struct mlx5_ind_table_obj *ind_tbl; + + if (hrxq->rss_key_len != rss_key_len) + continue; + if (memcmp(hrxq->rss_key, rss_key, rss_key_len)) + continue; + if (hrxq->hash_fields != hash_fields) + continue; + ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n); + if (!ind_tbl) + continue; + if (ind_tbl != hrxq->ind_table) { + mlx5_ind_table_obj_release(dev, ind_tbl); + continue; + } + rte_atomic32_inc(&hrxq->refcnt); + return idx; + } + return 0; +} + +/** + * Release the hash Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param hrxq + * Index to Hash Rx queue to release. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +int +mlx5_hrxq_release(struct rte_eth_dev *dev, uint32_t hrxq_idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], hrxq_idx); + if (!hrxq) + return 0; + if (rte_atomic32_dec_and_test(&hrxq->refcnt)) { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + mlx5_glue->destroy_flow_action(hrxq->action); +#endif + if (hrxq->ind_table->type == MLX5_IND_TBL_TYPE_IBV) + claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); + else /* hrxq->ind_table->type == MLX5_IND_TBL_TYPE_DEVX */ + claim_zero(mlx5_devx_cmd_destroy(hrxq->tir)); + mlx5_ind_table_obj_release(dev, hrxq->ind_table); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_HRXQ], &priv->hrxqs, + hrxq_idx, hrxq, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_HRXQ], hrxq_idx); + return 0; + } + claim_nonzero(mlx5_ind_table_obj_release(dev, hrxq->ind_table)); + return 1; +} + +/** + * Verify the Rx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_hrxq_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + uint32_t idx; + int ret = 0; + + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_HRXQ], priv->hrxqs, idx, + hrxq, next) { + DRV_LOG(DEBUG, + "port %u hash Rx queue %p still referenced", + dev->data->port_id, (void *)hrxq); + ++ret; + } + return ret; +} + +/** + * Create a drop Rx queue Verbs/DevX object. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. 
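+ *
+ * The drop queue is a minimal CQ plus a one-WQE work queue that the PMD
+ * never replenishes with buffers; it only exists so that drop flow rules
+ * and the drop hash Rx queue have a valid destination. It is created
+ * once per port and reused afterwards (priv->drop_queue.rxq).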
+ */ +static struct mlx5_rxq_obj * +mlx5_rxq_obj_drop_new(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_cq *cq; + struct ibv_wq *wq = NULL; + struct mlx5_rxq_obj *rxq; + + if (priv->drop_queue.rxq) + return priv->drop_queue.rxq; + cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0); + if (!cq) { + DEBUG("port %u cannot allocate CQ for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + wq = mlx5_glue->create_wq(ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = 1, + .max_sge = 1, + .pd = priv->sh->pd, + .cq = cq, + }); + if (!wq) { + DEBUG("port %u cannot allocate WQ for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + rxq = rte_calloc(__func__, 1, sizeof(*rxq), 0); + if (!rxq) { + DEBUG("port %u cannot allocate drop Rx queue memory", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + rxq->cq = cq; + rxq->wq = wq; + priv->drop_queue.rxq = rxq; + return rxq; +error: + if (wq) + claim_zero(mlx5_glue->destroy_wq(wq)); + if (cq) + claim_zero(mlx5_glue->destroy_cq(cq)); + return NULL; +} + +/** + * Release a drop Rx queue Verbs/DevX object. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. + */ +static void +mlx5_rxq_obj_drop_release(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_obj *rxq = priv->drop_queue.rxq; + + if (rxq->wq) + claim_zero(mlx5_glue->destroy_wq(rxq->wq)); + if (rxq->cq) + claim_zero(mlx5_glue->destroy_cq(rxq->cq)); + rte_free(rxq); + priv->drop_queue.rxq = NULL; +} + +/** + * Create a drop indirection table. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_ind_table_obj * +mlx5_ind_table_obj_drop_new(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + struct mlx5_rxq_obj *rxq; + struct mlx5_ind_table_obj tmpl; + + rxq = mlx5_rxq_obj_drop_new(dev); + if (!rxq) + return NULL; + tmpl.ind_table = mlx5_glue->create_rwq_ind_table + (priv->sh->ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = 0, + .ind_tbl = &rxq->wq, + .comp_mask = 0, + }); + if (!tmpl.ind_table) { + DEBUG("port %u cannot allocate indirection table for drop" + " queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl), 0); + if (!ind_tbl) { + rte_errno = ENOMEM; + goto error; + } + ind_tbl->ind_table = tmpl.ind_table; + return ind_tbl; +error: + mlx5_rxq_obj_drop_release(dev); + return NULL; +} + +/** + * Release a drop indirection table. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_ind_table_obj_drop_release(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl = priv->drop_queue.hrxq->ind_table; + + claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table)); + mlx5_rxq_obj_drop_release(dev); + rte_free(ind_tbl); + priv->drop_queue.hrxq->ind_table = NULL; +} + +/** + * Create a drop Rx Hash queue. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. 
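+ *
+ * The drop hash Rx queue is shared: the first caller creates it and
+ * later calls only take a reference on priv->drop_queue.hrxq, so it must
+ * be released as many times as it was requested, via
+ * mlx5_hrxq_drop_release().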
+ */ +struct mlx5_hrxq * +mlx5_hrxq_drop_new(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl = NULL; + struct ibv_qp *qp = NULL; + struct mlx5_hrxq *hrxq = NULL; + + if (priv->drop_queue.hrxq) { + rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt); + return priv->drop_queue.hrxq; + } + hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0); + if (!hrxq) { + DRV_LOG(WARNING, + "port %u cannot allocate memory for drop queue", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + priv->drop_queue.hrxq = hrxq; + ind_tbl = mlx5_ind_table_obj_drop_new(dev); + if (!ind_tbl) + goto error; + hrxq->ind_table = ind_tbl; + qp = mlx5_glue->create_qp_ex(priv->sh->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = + IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN, + .rx_hash_key = rss_hash_default_key, + .rx_hash_fields_mask = 0, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->sh->pd + }); + if (!qp) { + DEBUG("port %u cannot allocate QP for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + hrxq->qp = qp; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp); + if (!hrxq->action) { + rte_errno = errno; + goto error; + } +#endif + rte_atomic32_set(&hrxq->refcnt, 1); + return hrxq; +error: +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (hrxq && hrxq->action) + mlx5_glue->destroy_flow_action(hrxq->action); +#endif + if (qp) + claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); + if (ind_tbl) + mlx5_ind_table_obj_drop_release(dev); + if (hrxq) { + priv->drop_queue.hrxq = NULL; + rte_free(hrxq); + } + return NULL; +} + +/** + * Release a drop hash Rx queue. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_hrxq_drop_release(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq; + + if (rte_atomic32_dec_and_test(&hrxq->refcnt)) { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + mlx5_glue->destroy_flow_action(hrxq->action); +#endif + claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); + mlx5_ind_table_obj_drop_release(dev); + rte_free(hrxq); + priv->drop_queue.hrxq = NULL; + } +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c new file mode 100644 index 000000000..6a17a9a5d --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c @@ -0,0 +1,5691 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015-2019 Mellanox Technologies, Ltd + */ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> +#include <rte_common.h> +#include <rte_branch_prediction.h> +#include <rte_ether.h> +#include <rte_cycles.h> +#include <rte_flow.h> + +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> +#include <mlx5_common.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_mr.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" + +/* TX burst subroutines return codes. */ +enum mlx5_txcmp_code { + MLX5_TXCMP_CODE_EXIT = 0, + MLX5_TXCMP_CODE_ERROR, + MLX5_TXCMP_CODE_SINGLE, + MLX5_TXCMP_CODE_MULTI, + MLX5_TXCMP_CODE_TSO, + MLX5_TXCMP_CODE_EMPW, +}; + +/* + * These defines are used to configure Tx burst routine option set + * supported at compile time. The not specified options are optimized out + * out due to if conditions can be explicitly calculated at compile time. + * The offloads with bigger runtime check (require more CPU cycles to + * skip) overhead should have the bigger index - this is needed to + * select the better matching routine function if no exact match and + * some offloads are not actually requested. + */ +#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/ +#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/ +#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/ +#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */ +#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */ +#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/ +#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */ +#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/ +#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/ + +/* The most common offloads groups. 
*/ +#define MLX5_TXOFF_CONFIG_NONE 0 +#define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ + MLX5_TXOFF_CONFIG_TSO | \ + MLX5_TXOFF_CONFIG_SWP | \ + MLX5_TXOFF_CONFIG_CSUM | \ + MLX5_TXOFF_CONFIG_INLINE | \ + MLX5_TXOFF_CONFIG_VLAN | \ + MLX5_TXOFF_CONFIG_METADATA) + +#define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) + +#define MLX5_TXOFF_DECL(func, olx) \ +static uint16_t mlx5_tx_burst_##func(void *txq, \ + struct rte_mbuf **pkts, \ + uint16_t pkts_n) \ +{ \ + return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ + pkts, pkts_n, (olx)); \ +} + +#define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, + +static __rte_always_inline uint32_t +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); + +static __rte_always_inline int +mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); + +static __rte_always_inline uint32_t +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); + +static __rte_always_inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); + +static __rte_always_inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, + const unsigned int strd_n); + +static int +mlx5_queue_state_modify(struct rte_eth_dev *dev, + struct mlx5_mp_arg_queue_state_modify *sm); + +static inline void +mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, + volatile struct mlx5_cqe *restrict cqe, + uint32_t phcsum); + +static inline void +mlx5_lro_update_hdr(uint8_t *restrict padd, + volatile struct mlx5_cqe *restrict cqe, + uint32_t len); + +uint32_t mlx5_ptype_table[] __rte_cache_aligned = { + [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ +}; + +uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; +uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; + +uint64_t rte_net_mlx5_dynf_inline_mask; +#define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask + +/** + * Build a table to translate Rx completion flags to packet type. + * + * @note: fix mlx5_dev_supported_ptypes_get() if any change here. + */ +void +mlx5_set_ptype_table(void) +{ + unsigned int i; + uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; + + /* Last entry must not be overwritten, reserved for errored packet. 
*/ + for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) + (*p)[i] = RTE_PTYPE_UNKNOWN; + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + /* L2 */ + (*p)[0x00] = RTE_PTYPE_L2_ETHER; + /* L3 */ + (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + /* Fragmented */ + (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + /* TCP */ + (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + /* UDP */ + (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Repeat with outer_l3_type being set. Just in case. */ + (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Tunneled - L3 */ + (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; + (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; + (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + /* Tunneled - Fragmented */ + (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + 
RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + /* Tunneled - TCP */ + (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + /* Tunneled - UDP */ + (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; +} + +/** + * Build a table to translate packet to checksum type of Verbs. + */ +void +mlx5_set_cksum_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0] = PKT_TX_TCP_SEG + * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM + * bit[4] = PKT_TX_IP_CKSUM + * bit[8] = PKT_TX_OUTER_IP_CKSUM + * bit[9] = tunnel + */ + for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { + v = 0; + if (i & (1 << 9)) { + /* Tunneled packet. */ + if (i & (1 << 8)) /* Outer IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (1 << 4)) /* Inner IP. */ + v |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_INNER_CSUM; + } else { + /* No tunnel. */ + if (i & (1 << 4)) /* IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_CSUM; + } + mlx5_cksum_table[i] = v; + } +} + +/** + * Build a table to translate packet type of mbuf to SWP type of Verbs. 
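+ *
+ * Worked example (assumes the standard PKT_TX_* mbuf flag layout, shown
+ * purely for illustration): an outer-IPv6 UDP tunnel carrying an inner
+ * IPv6/UDP packet with PKT_TX_UDP_CKSUM maps to index bits 9, 8, 4 and
+ * bits[1:0] = 3, so the table entry combines MLX5_ETH_WQE_L3_OUTER_IPV6,
+ * MLX5_ETH_WQE_L4_OUTER_UDP, MLX5_ETH_WQE_L3_INNER_IPV6 and
+ * MLX5_ETH_WQE_L4_INNER_UDP.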
+ */ +void +mlx5_set_swp_types_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { + v = 0; + if (i & (1 << 8)) + v |= MLX5_ETH_WQE_L3_OUTER_IPV6; + if (i & (1 << 9)) + v |= MLX5_ETH_WQE_L4_OUTER_UDP; + if (i & (1 << 4)) + v |= MLX5_ETH_WQE_L3_INNER_IPV6; + if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) + v |= MLX5_ETH_WQE_L4_INNER_UDP; + mlx5_swp_types_table[i] = v; + } +} + +/** + * Set Software Parser flags and offsets in Ethernet Segment of WQE. + * Flags must be preliminary initialized to zero. + * + * @param loc + * Pointer to burst routine local context. + * @param swp_flags + * Pointer to store Software Parser flags + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Software Parser offsets packed in dword. + * Software Parser flags are set by pointer. + */ +static __rte_always_inline uint32_t +txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, + uint8_t *swp_flags, + unsigned int olx) +{ + uint64_t ol, tunnel; + unsigned int idx, off; + uint32_t set; + + if (!MLX5_TXOFF_CONFIG(SWP)) + return 0; + ol = loc->mbuf->ol_flags; + tunnel = ol & PKT_TX_TUNNEL_MASK; + /* + * Check whether Software Parser is required. + * Only customized tunnels may ask for. + */ + if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) + return 0; + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; + idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; + *swp_flags = mlx5_swp_types_table[idx]; + /* + * Set offsets for SW parser. Since ConnectX-5, SW parser just + * complements HW parser. SW parser starts to engage only if HW parser + * can't reach a header. For the older devices, HW parser will not kick + * in if any of SWP offsets is set. Therefore, all of the L3 offsets + * should be set regardless of HW offload. + */ + off = loc->mbuf->outer_l2_len; + if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) + off += sizeof(struct rte_vlan_hdr); + set = (off >> 1) << 8; /* Outer L3 offset. */ + off += loc->mbuf->outer_l3_len; + if (tunnel == PKT_TX_TUNNEL_UDP) + set |= off >> 1; /* Outer L4 offset. */ + if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ + const uint64_t csum = ol & PKT_TX_L4_MASK; + off += loc->mbuf->l2_len; + set |= (off >> 1) << 24; /* Inner L3 offset. */ + if (csum == PKT_TX_TCP_CKSUM || + csum == PKT_TX_UDP_CKSUM || + (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { + off += loc->mbuf->l3_len; + set |= (off >> 1) << 16; /* Inner L4 offset. */ + } + } + set = rte_cpu_to_le_32(set); + return set; +} + +/** + * Convert the Checksum offloads to Verbs. + * + * @param buf + * Pointer to the mbuf. + * + * @return + * Converted checksum flags. 
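+ *
+ * Illustrative usage sketch (mirrors how the Ethernet Segment builders
+ * below consume the returned value):
+ *   csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ *   es->flags = rte_cpu_to_le_32(csum);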
+ */
+static __rte_always_inline uint8_t
+txq_ol_cksum_to_cs(struct rte_mbuf *buf)
+{
+ uint32_t idx;
+ uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
+ const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
+ PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
+
+ /*
+ * The index should have:
+ * bit[0] = PKT_TX_TCP_SEG
+ * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
+ * bit[4] = PKT_TX_IP_CKSUM
+ * bit[8] = PKT_TX_OUTER_IP_CKSUM
+ * bit[9] = tunnel
+ */
+ idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
+ return mlx5_cksum_table[idx];
+}
+
+/**
+ * Internal function to compute the number of used descriptors in an RX queue.
+ *
+ * @param rxq
+ * The Rx queue.
+ *
+ * @return
+ * The number of used Rx descriptors.
+ */
+static uint32_t
+rx_queue_count(struct mlx5_rxq_data *rxq)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ volatile struct mlx5_cqe *cqe;
+ const unsigned int cqe_n = (1 << rxq->cqe_n);
+ const unsigned int cqe_cnt = cqe_n - 1;
+ unsigned int cq_ci;
+ unsigned int used;
+
+ /* if we are processing a compressed cqe */
+ if (zip->ai) {
+ used = zip->cqe_cnt - zip->ca;
+ cq_ci = zip->cq_ci;
+ } else {
+ used = 0;
+ cq_ci = rxq->cq_ci;
+ }
+ cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
+ while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
+ int8_t op_own;
+ unsigned int n;
+
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
+ n = rte_be_to_cpu_32(cqe->byte_cnt);
+ else
+ n = 1;
+ cq_ci += n;
+ used += n;
+ cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
+ }
+ used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
+ return used;
+}
+
+/**
+ * DPDK callback to check the status of an Rx descriptor.
+ *
+ * @param rx_queue
+ * The Rx queue.
+ * @param[in] offset
+ * The index of the descriptor in the ring.
+ *
+ * @return
+ * The status of the Rx descriptor.
+ */
+int
+mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
+{
+ struct mlx5_rxq_data *rxq = rx_queue;
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
+
+ if (dev->rx_pkt_burst != mlx5_rx_burst) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ if (offset >= (1 << rxq->elts_n)) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ if (offset < rx_queue_count(rxq))
+ return RTE_ETH_RX_DESC_DONE;
+ return RTE_ETH_RX_DESC_AVAIL;
+}
+
+/**
+ * DPDK callback to get the RX queue information.
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ * Rx queue identifier.
+ *
+ * @param qinfo
+ * Pointer to the RX queue information structure.
+ *
+ * @return
+ * None.
+ */
+
+void
+mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+ struct rte_eth_rxq_info *qinfo)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (!rxq)
+ return;
+ qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
+ rxq->mprq_mp : rxq->mp;
+ qinfo->conf.rx_thresh.pthresh = 0;
+ qinfo->conf.rx_thresh.hthresh = 0;
+ qinfo->conf.rx_thresh.wthresh = 0;
+ qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
+ qinfo->conf.rx_drop_en = 1;
+ qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
+ qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
+ qinfo->scattered_rx = dev->data->scattered_rx;
+ qinfo->nb_desc = 1 << rxq->elts_n;
+}
+
+/**
+ * DPDK callback to get the RX packet burst mode information.
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ * Rx queue identifier.
+ *
+ * @param mode
+ * Pointer to the burst mode information.
+ *
+ * @return
+ * 0 on success, -EINVAL on failure.
+ */
+
+int
+mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
+ uint16_t rx_queue_id __rte_unused,
+ struct rte_eth_burst_mode *mode)
+{
+ eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
+
+ if (pkt_burst == mlx5_rx_burst) {
+ snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
+ } else if (pkt_burst == mlx5_rx_burst_mprq) {
+ snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
+ } else if (pkt_burst == mlx5_rx_burst_vec) {
+#if defined RTE_ARCH_X86_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
+#elif defined RTE_ARCH_ARM64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
+#else
+ return -EINVAL;
+#endif
+ } else {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * DPDK callback to get the number of used descriptors in an RX queue.
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ * The Rx queue.
+ *
+ * @return
+ * The number of used Rx descriptors,
+ * or -EINVAL if the queue is invalid.
+ */
+uint32_t
+mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq;
+
+ if (dev->rx_pkt_burst != mlx5_rx_burst) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ rxq = (*priv->rxqs)[rx_queue_id];
+ if (!rxq) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ return rx_queue_count(rxq);
+}
+
+#define MLX5_SYSTEM_LOG_DIR "/var/log"
+/**
+ * Dump debug information to log file.
+ *
+ * @param fname
+ * The file name.
+ * @param hex_title
+ * If not NULL this string is printed as a header to the output
+ * and the output will be in hexadecimal view.
+ * @param buf
+ * This is the buffer address to print out.
+ * @param hex_len
+ * The number of bytes to dump out.
+ */
+void
+mlx5_dump_debug_information(const char *fname, const char *hex_title,
+ const void *buf, unsigned int hex_len)
+{
+ FILE *fd;
+
+ MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
+ fd = fopen(path, "a+");
+ if (!fd) {
+ DRV_LOG(WARNING, "cannot open %s for debug dump", path);
+ MKSTR(path2, "./%s", fname);
+ fd = fopen(path2, "a+");
+ if (!fd) {
+ DRV_LOG(ERR, "cannot open %s for debug dump", path2);
+ return;
+ }
+ DRV_LOG(INFO, "New debug dump in file %s", path2);
+ } else {
+ DRV_LOG(INFO, "New debug dump in file %s", path);
+ }
+ if (hex_title)
+ rte_hexdump(fd, hex_title, buf, hex_len);
+ else
+ fprintf(fd, "%s", (const char *)buf);
+ fprintf(fd, "\n\n\n");
+ fclose(fd);
+}
+
+/**
+ * Move QP from error state to running state and initialize indexes.
+ *
+ * @param txq_ctrl
+ * Pointer to TX queue control structure.
+ *
+ * @return
+ * 0 on success, else -1.
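+ *
+ * Illustrative usage sketch (this is how mlx5_tx_error_cqe_handle()
+ * below drives the recovery path; on failure it retries on the same WQE):
+ *   if (tx_recover_qp(txq_ctrl))
+ *           return -1;
+ *   txq_free_elts(txq_ctrl);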
+ */ +static int +tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_mp_arg_queue_state_modify sm = { + .is_wq = 0, + .queue_id = txq_ctrl->txq.idx, + }; + + if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) + return -1; + txq_ctrl->txq.wqe_ci = 0; + txq_ctrl->txq.wqe_pi = 0; + txq_ctrl->txq.elts_comp = 0; + return 0; +} + +/* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ +static int +check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) +{ + static const uint8_t magic[] = "seen"; + int ret = 1; + unsigned int i; + + for (i = 0; i < sizeof(magic); ++i) + if (!ret || err_cqe->rsvd1[i] != magic[i]) { + ret = 0; + err_cqe->rsvd1[i] = magic[i]; + } + return ret; +} + +/** + * Handle error CQE. + * + * @param txq + * Pointer to TX queue structure. + * @param error_cqe + * Pointer to the error CQE. + * + * @return + * Negative value if queue recovery failed, otherwise + * the error completion entry is handled successfully. + */ +static int +mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, + volatile struct mlx5_err_cqe *err_cqe) +{ + if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { + const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); + int seen = check_err_cqe_seen(err_cqe); + + if (!seen && txq_ctrl->dump_file_n < + txq_ctrl->priv->config.max_dump_files_num) { + MKSTR(err_str, "Unexpected CQE error syndrome " + "0x%02x CQN = %u SQN = %u wqe_counter = %u " + "wq_ci = %u cq_ci = %u", err_cqe->syndrome, + txq->cqe_s, txq->qp_num_8s >> 8, + rte_be_to_cpu_16(err_cqe->wqe_counter), + txq->wqe_ci, txq->cq_ci); + MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", + PORT_ID(txq_ctrl->priv), txq->idx, + txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); + mlx5_dump_debug_information(name, NULL, err_str, 0); + mlx5_dump_debug_information(name, "MLX5 Error CQ:", + (const void *)((uintptr_t) + txq->cqes), + sizeof(*err_cqe) * + (1 << txq->cqe_n)); + mlx5_dump_debug_information(name, "MLX5 Error SQ:", + (const void *)((uintptr_t) + txq->wqes), + MLX5_WQE_SIZE * + (1 << txq->wqe_n)); + txq_ctrl->dump_file_n++; + } + if (!seen) + /* + * Count errors in WQEs units. + * Later it can be improved to count error packets, + * for example, by SQ parsing to find how much packets + * should be counted for each WQE. + */ + txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - + new_wqe_pi) & wqe_m; + if (tx_recover_qp(txq_ctrl)) { + /* Recovering failed - retry later on the same WQE. */ + return -1; + } + /* Release all the remaining buffers. */ + txq_free_elts(txq_ctrl); + } + return 0; +} + +/** + * Translate RX completion flags to packet type. + * + * @param[in] rxq + * Pointer to RX queue structure. + * @param[in] cqe + * Pointer to CQE. + * + * @note: fix mlx5_dev_supported_ptypes_get() if any change here. + * + * @return + * Packet type for struct rte_mbuf. + */ +static inline uint32_t +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) +{ + uint8_t idx; + uint8_t pinfo = cqe->pkt_info; + uint16_t ptype = cqe->hdr_type_etc; + + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); + return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); +} + +/** + * Initialize Rx WQ and indexes. 
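+ *
+ * Illustrative caller sketch (mirrors the Rx error recovery in
+ * mlx5_rx_err_handle() below once the WQ is moved back to ready state):
+ *   sm.is_wq = 1;
+ *   sm.queue_id = rxq->idx;
+ *   sm.state = IBV_WQS_RDY;
+ *   if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
+ *           return -1;
+ *   mlx5_rxq_initialize(rxq);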
+ * + * @param[in] rxq + * Pointer to RX queue structure. + */ +void +mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) +{ + const unsigned int wqe_n = 1 << rxq->elts_n; + unsigned int i; + + for (i = 0; (i != wqe_n); ++i) { + volatile struct mlx5_wqe_data_seg *scat; + uintptr_t addr; + uint32_t byte_count; + + if (mlx5_rxq_mprq_enabled(rxq)) { + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; + + scat = &((volatile struct mlx5_wqe_mprq *) + rxq->wqes)[i].dseg; + addr = (uintptr_t)mlx5_mprq_buf_addr(buf, + 1 << rxq->strd_num_n); + byte_count = (1 << rxq->strd_sz_n) * + (1 << rxq->strd_num_n); + } else { + struct rte_mbuf *buf = (*rxq->elts)[i]; + + scat = &((volatile struct mlx5_wqe_data_seg *) + rxq->wqes)[i]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + byte_count = DATA_LEN(buf); + } + /* scat->addr must be able to store a pointer. */ + MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); + *scat = (struct mlx5_wqe_data_seg){ + .addr = rte_cpu_to_be_64(addr), + .byte_count = rte_cpu_to_be_32(byte_count), + .lkey = mlx5_rx_addr2mr(rxq, addr), + }; + } + rxq->consumed_strd = 0; + rxq->decompressed = 0; + rxq->rq_pi = 0; + rxq->zip = (struct rxq_zip){ + .ai = 0, + }; + /* Update doorbell counter. */ + rxq->rq_ci = wqe_n >> rxq->sges_n; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); +} + +/** + * Modify a Verbs/DevX queue state. + * This must be called from the primary process. + * + * @param dev + * Pointer to Ethernet device. + * @param sm + * State modify request parameters. + * + * @return + * 0 in case of success else non-zero value and rte_errno is set. + */ +int +mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, + const struct mlx5_mp_arg_queue_state_modify *sm) +{ + int ret; + struct mlx5_priv *priv = dev->data->dev_private; + + if (sm->is_wq) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { + struct ibv_wq_attr mod = { + .attr_mask = IBV_WQ_ATTR_STATE, + .wq_state = sm->state, + }; + + ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); + } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. 
*/ + struct mlx5_devx_modify_rq_attr rq_attr; + + memset(&rq_attr, 0, sizeof(rq_attr)); + if (sm->state == IBV_WQS_RESET) { + rq_attr.rq_state = MLX5_RQC_STATE_ERR; + rq_attr.state = MLX5_RQC_STATE_RST; + } else if (sm->state == IBV_WQS_RDY) { + rq_attr.rq_state = MLX5_RQC_STATE_RST; + rq_attr.state = MLX5_RQC_STATE_RDY; + } else if (sm->state == IBV_WQS_ERR) { + rq_attr.rq_state = MLX5_RQC_STATE_RDY; + rq_attr.state = MLX5_RQC_STATE_ERR; + } + ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, + &rq_attr); + } + if (ret) { + DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", + sm->state, strerror(errno)); + rte_errno = errno; + return ret; + } + } else { + struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct ibv_qp_attr mod = { + .qp_state = IBV_QPS_RESET, + .port_num = (uint8_t)priv->ibv_port, + }; + struct ibv_qp *qp = txq_ctrl->obj->qp; + + ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " + "%s", strerror(errno)); + rte_errno = errno; + return ret; + } + mod.qp_state = IBV_QPS_INIT; + ret = mlx5_glue->modify_qp(qp, &mod, + (IBV_QP_STATE | IBV_QP_PORT)); + if (ret) { + DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", + strerror(errno)); + rte_errno = errno; + return ret; + } + mod.qp_state = IBV_QPS_RTR; + ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", + strerror(errno)); + rte_errno = errno; + return ret; + } + mod.qp_state = IBV_QPS_RTS; + ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", + strerror(errno)); + rte_errno = errno; + return ret; + } + } + return 0; +} + +/** + * Modify a Verbs queue state. + * + * @param dev + * Pointer to Ethernet device. + * @param sm + * State modify request parameters. + * + * @return + * 0 in case of success else non-zero value. + */ +static int +mlx5_queue_state_modify(struct rte_eth_dev *dev, + struct mlx5_mp_arg_queue_state_modify *sm) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret = 0; + + switch (rte_eal_process_type()) { + case RTE_PROC_PRIMARY: + ret = mlx5_queue_state_modify_primary(dev, sm); + break; + case RTE_PROC_SECONDARY: + ret = mlx5_mp_req_queue_state_modify(&priv->mp_id, sm); + break; + default: + break; + } + return ret; +} + +/** + * Handle a Rx error. + * The function inserts the RQ state to reset when the first error CQE is + * shown, then drains the CQ by the caller function loop. When the CQ is empty, + * it moves the RQ state to ready and initializes the RQ. + * Next CQE identification and error counting are in the caller responsibility. + * + * @param[in] rxq + * Pointer to RX queue structure. + * @param[in] vec + * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. + * 0 when called from non-vectorized Rx burst. + * + * @return + * -1 in case of recovery error, otherwise the CQE status. 
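+ *
+ * Illustrative caller sketch (this mirrors the use in mlx5_rx_poll_len()
+ * below when an error CQE or a pending error state is detected):
+ *   ret = mlx5_rx_err_handle(rxq, 0);
+ *   if (ret == MLX5_CQE_STATUS_HW_OWN || ret == -1)
+ *           return 0;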
+ */ +int +mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) +{ + const uint16_t cqe_n = 1 << rxq->cqe_n; + const uint16_t cqe_mask = cqe_n - 1; + const unsigned int wqe_n = 1 << rxq->elts_n; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + union { + volatile struct mlx5_cqe *cqe; + volatile struct mlx5_err_cqe *err_cqe; + } u = { + .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], + }; + struct mlx5_mp_arg_queue_state_modify sm; + int ret; + + switch (rxq->err_state) { + case MLX5_RXQ_ERR_STATE_NO_ERROR: + rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; + /* Fall-through */ + case MLX5_RXQ_ERR_STATE_NEED_RESET: + sm.is_wq = 1; + sm.queue_id = rxq->idx; + sm.state = IBV_WQS_RESET; + if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) + return -1; + if (rxq_ctrl->dump_file_n < + rxq_ctrl->priv->config.max_dump_files_num) { + MKSTR(err_str, "Unexpected CQE error syndrome " + "0x%02x CQN = %u RQN = %u wqe_counter = %u" + " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, + rxq->cqn, rxq_ctrl->wqn, + rte_be_to_cpu_16(u.err_cqe->wqe_counter), + rxq->rq_ci << rxq->sges_n, rxq->cq_ci); + MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", + rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); + mlx5_dump_debug_information(name, NULL, err_str, 0); + mlx5_dump_debug_information(name, "MLX5 Error CQ:", + (const void *)((uintptr_t) + rxq->cqes), + sizeof(*u.cqe) * cqe_n); + mlx5_dump_debug_information(name, "MLX5 Error RQ:", + (const void *)((uintptr_t) + rxq->wqes), + 16 * wqe_n); + rxq_ctrl->dump_file_n++; + } + rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; + /* Fall-through */ + case MLX5_RXQ_ERR_STATE_NEED_READY: + ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); + if (ret == MLX5_CQE_STATUS_HW_OWN) { + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + rte_cio_wmb(); + /* + * The RQ consumer index must be zeroed while moving + * from RESET state to RDY state. + */ + *rxq->rq_db = rte_cpu_to_be_32(0); + rte_cio_wmb(); + sm.is_wq = 1; + sm.queue_id = rxq->idx; + sm.state = IBV_WQS_RDY; + if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), + &sm)) + return -1; + if (vec) { + const uint16_t q_mask = wqe_n - 1; + uint16_t elt_idx; + struct rte_mbuf **elt; + int i; + unsigned int n = wqe_n - (rxq->rq_ci - + rxq->rq_pi); + + for (i = 0; i < (int)n; ++i) { + elt_idx = (rxq->rq_ci + i) & q_mask; + elt = &(*rxq->elts)[elt_idx]; + *elt = rte_mbuf_raw_alloc(rxq->mp); + if (!*elt) { + for (i--; i >= 0; --i) { + elt_idx = (rxq->rq_ci + + i) & q_mask; + elt = &(*rxq->elts) + [elt_idx]; + rte_pktmbuf_free_seg + (*elt); + } + return -1; + } + } + for (i = 0; i < (int)wqe_n; ++i) { + elt = &(*rxq->elts)[i]; + DATA_LEN(*elt) = + (uint16_t)((*elt)->buf_len - + rte_pktmbuf_headroom(*elt)); + } + /* Padding with a fake mbuf for vec Rx. */ + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + (*rxq->elts)[wqe_n + i] = + &rxq->fake_mbuf; + } + mlx5_rxq_initialize(rxq); + rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; + } + return ret; + default: + return -1; + } +} + +/** + * Get size of the next packet for a given CQE. For compressed CQEs, the + * consumer index is updated only once all packets of the current one have + * been processed. + * + * @param rxq + * Pointer to RX queue. + * @param cqe + * CQE to process. + * @param[out] mcqe + * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not + * written. + * + * @return + * 0 in case of empty CQE, otherwise the packet size in bytes. 
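+ *
+ * Illustrative caller sketch (as used by mlx5_rx_burst() below):
+ *   cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ *   len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
+ *   if (!len)
+ *           break;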
+ */ +static inline int +mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) +{ + struct rxq_zip *zip = &rxq->zip; + uint16_t cqe_n = cqe_cnt + 1; + int len; + uint16_t idx, end; + + do { + len = 0; + /* Process compressed data in the CQE and mini arrays. */ + if (zip->ai) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes)[zip->ca & + cqe_cnt].pkt_info); + + len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); + *mcqe = &(*mc)[zip->ai & 7]; + if ((++zip->ai & 7) == 0) { + /* Invalidate consumed CQEs */ + idx = zip->ca; + end = zip->na; + while (idx != end) { + (*rxq->cqes)[idx & cqe_cnt].op_own = + MLX5_CQE_INVALIDATE; + ++idx; + } + /* + * Increment consumer index to skip the number + * of CQEs consumed. Hardware leaves holes in + * the CQ ring for software use. + */ + zip->ca = zip->na; + zip->na += 8; + } + if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { + /* Invalidate the rest */ + idx = zip->ca; + end = zip->cq_ci; + + while (idx != end) { + (*rxq->cqes)[idx & cqe_cnt].op_own = + MLX5_CQE_INVALIDATE; + ++idx; + } + rxq->cq_ci = zip->cq_ci; + zip->ai = 0; + } + /* + * No compressed data, get next CQE and verify if it is + * compressed. + */ + } else { + int ret; + int8_t op_own; + + ret = check_cqe(cqe, cqe_n, rxq->cq_ci); + if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { + if (unlikely(ret == MLX5_CQE_STATUS_ERR || + rxq->err_state)) { + ret = mlx5_rx_err_handle(rxq, 0); + if (ret == MLX5_CQE_STATUS_HW_OWN || + ret == -1) + return 0; + } else { + return 0; + } + } + ++rxq->cq_ci; + op_own = cqe->op_own; + if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes) + [rxq->cq_ci & + cqe_cnt].pkt_info); + + /* Fix endianness. */ + zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); + /* + * Current mini array position is the one + * returned by check_cqe64(). + * + * If completion comprises several mini arrays, + * as a special case the second one is located + * 7 CQEs after the initial CQE instead of 8 + * for subsequent ones. + */ + zip->ca = rxq->cq_ci; + zip->na = zip->ca + 7; + /* Compute the next non compressed CQE. */ + --rxq->cq_ci; + zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; + /* Get packet size to return. */ + len = rte_be_to_cpu_32((*mc)[0].byte_cnt); + *mcqe = &(*mc)[0]; + zip->ai = 1; + /* Prefetch all to be invalidated */ + idx = zip->ca; + end = zip->cq_ci; + while (idx != end) { + rte_prefetch0(&(*rxq->cqes)[(idx) & + cqe_cnt]); + ++idx; + } + } else { + len = rte_be_to_cpu_32(cqe->byte_cnt); + } + } + if (unlikely(rxq->err_state)) { + cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + ++rxq->stats.idropped; + } else { + return len; + } + } while (1); +} + +/** + * Translate RX completion flags to offload flags. + * + * @param[in] cqe + * Pointer to CQE. + * + * @return + * Offload flags (ol_flags) for struct rte_mbuf. + */ +static inline uint32_t +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) +{ + uint32_t ol_flags = 0; + uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); + + ol_flags = + TRANSPOSE(flags, + MLX5_CQE_RX_L3_HDR_VALID, + PKT_RX_IP_CKSUM_GOOD) | + TRANSPOSE(flags, + MLX5_CQE_RX_L4_HDR_VALID, + PKT_RX_L4_CKSUM_GOOD); + return ol_flags; +} + +/** + * Fill in mbuf fields from RX completion flags. + * Note that pkt->ol_flags should be initialized outside of this function. + * + * @param rxq + * Pointer to RX queue. 
+ * @param pkt + * mbuf to fill. + * @param cqe + * CQE to process. + * @param rss_hash_res + * Packet RSS Hash result. + */ +static inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) +{ + /* Update packet information. */ + pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); + if (rss_hash_res && rxq->rss_hash) { + pkt->hash.rss = rss_hash_res; + pkt->ol_flags |= PKT_RX_RSS_HASH; + } + if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { + pkt->ol_flags |= PKT_RX_FDIR; + if (cqe->sop_drop_qpn != + rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { + uint32_t mark = cqe->sop_drop_qpn; + + pkt->ol_flags |= PKT_RX_FDIR_ID; + pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); + } + } + if (rxq->dynf_meta && cqe->flow_table_metadata) { + pkt->ol_flags |= rxq->flow_meta_mask; + *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) = + cqe->flow_table_metadata; + } + if (rxq->csum) + pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); + if (rxq->vlan_strip && + (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { + pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; + pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); + } + if (rxq->hw_timestamp) { + pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); + pkt->ol_flags |= PKT_RX_TIMESTAMP; + } +} + +/** + * DPDK callback for RX. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; + const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; + const unsigned int sges_n = rxq->sges_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + volatile struct mlx5_cqe *cqe = + &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + unsigned int i = 0; + unsigned int rq_ci = rxq->rq_ci << sges_n; + int len = 0; /* keep its value across iterations. */ + + while (pkts_n) { + unsigned int idx = rq_ci & wqe_cnt; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + volatile struct mlx5_mini_cqe8 *mcqe = NULL; + uint32_t rss_hash_res; + + if (pkt) + NEXT(seg) = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(cqe); + rte_prefetch0(wqe); + rep = rte_mbuf_raw_alloc(rxq->mp); + if (unlikely(rep == NULL)) { + ++rxq->stats.rx_nombuf; + if (!pkt) { + /* + * no buffers before we even started, + * bail out silently. + */ + break; + } + while (pkt != seg) { + MLX5_ASSERT(pkt != (*rxq->elts)[idx]); + rep = NEXT(pkt); + NEXT(pkt) = NULL; + NB_SEGS(pkt) = 1; + rte_mbuf_raw_free(pkt); + pkt = rep; + } + break; + } + if (!pkt) { + cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); + if (!len) { + rte_mbuf_raw_free(rep); + break; + } + pkt = seg; + MLX5_ASSERT(len >= (rxq->crc_present << 2)); + pkt->ol_flags &= EXT_ATTACHED_MBUF; + /* If compressed, take hash result from mini-CQE. */ + rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 
+ cqe->rx_hash_res : + mcqe->rx_hash_result); + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); + if (rxq->crc_present) + len -= RTE_ETHER_CRC_LEN; + PKT_LEN(pkt) = len; + if (cqe->lro_num_seg > 1) { + mlx5_lro_update_hdr + (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, + len); + pkt->ol_flags |= PKT_RX_LRO; + pkt->tso_segsz = len / cqe->lro_num_seg; + } + } + DATA_LEN(rep) = DATA_LEN(seg); + PKT_LEN(rep) = PKT_LEN(seg); + SET_DATA_OFF(rep, DATA_OFF(seg)); + PORT(rep) = PORT(seg); + (*rxq->elts)[idx] = rep; + /* + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. + */ + wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_mb2mr(rxq, rep); + if (len > DATA_LEN(seg)) { + len -= DATA_LEN(seg); + ++NB_SEGS(pkt); + ++rq_ci; + continue; + } + DATA_LEN(seg) = len; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment bytes counter. */ + rxq->stats.ibytes += PKT_LEN(pkt); +#endif + /* Return packet. */ + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; + /* Align consumer index to the next stride. */ + rq_ci >>= sges_n; + ++rq_ci; + rq_ci <<= sges_n; + } + if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) + return 0; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci >> sges_n; + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment packets counter. */ + rxq->stats.ipackets += i; +#endif + return i; +} + +/** + * Update LRO packet TCP header. + * The HW LRO feature doesn't update the TCP header after coalescing the + * TCP segments but supplies information in CQE to fill it by SW. + * + * @param tcp + * Pointer to the TCP header. + * @param cqe + * Pointer to the completion entry.. + * @param phcsum + * The L3 pseudo-header checksum. + */ +static inline void +mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, + volatile struct mlx5_cqe *restrict cqe, + uint32_t phcsum) +{ + uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & + MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; + /* + * The HW calculates only the TCP payload checksum, need to complete + * the TCP header checksum and the L3 pseudo-header checksum. + */ + uint32_t csum = phcsum + cqe->csum; + + if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK || + l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) { + tcp->tcp_flags |= RTE_TCP_ACK_FLAG; + tcp->recv_ack = cqe->lro_ack_seq_num; + tcp->rx_win = cqe->lro_tcp_win; + } + if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) + tcp->tcp_flags |= RTE_TCP_PSH_FLAG; + tcp->cksum = 0; + csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); + csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); + csum = (~csum) & 0xffff; + if (csum == 0) + csum = 0xffff; + tcp->cksum = csum; +} + +/** + * Update LRO packet headers. + * The HW LRO feature doesn't update the L3/TCP headers after coalescing the + * TCP segments but supply information in CQE to fill it by SW. + * + * @param padd + * The packet address. + * @param cqe + * Pointer to the completion entry.. + * @param len + * The packet length. 
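+ *
+ * Illustrative caller sketch (as used by the Rx burst routines in this
+ * file once a coalesced LRO packet is received):
+ *   if (cqe->lro_num_seg > 1) {
+ *           mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *), cqe, len);
+ *           pkt->ol_flags |= PKT_RX_LRO;
+ *           pkt->tso_segsz = len / cqe->lro_num_seg;
+ *   }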
+ */ +static inline void +mlx5_lro_update_hdr(uint8_t *restrict padd, + volatile struct mlx5_cqe *restrict cqe, + uint32_t len) +{ + union { + struct rte_ether_hdr *eth; + struct rte_vlan_hdr *vlan; + struct rte_ipv4_hdr *ipv4; + struct rte_ipv6_hdr *ipv6; + struct rte_tcp_hdr *tcp; + uint8_t *hdr; + } h = { + .hdr = padd, + }; + uint16_t proto = h.eth->ether_type; + uint32_t phcsum; + + h.eth++; + while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || + proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { + proto = h.vlan->eth_proto; + h.vlan++; + } + if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { + h.ipv4->time_to_live = cqe->lro_min_ttl; + h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); + h.ipv4->hdr_checksum = 0; + h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); + phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); + h.ipv4++; + } else { + h.ipv6->hop_limits = cqe->lro_min_ttl; + h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - + sizeof(*h.ipv6)); + phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); + h.ipv6++; + } + mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); +} + +void +mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) +{ + struct mlx5_mprq_buf *buf = opaque; + + if (rte_atomic16_read(&buf->refcnt) == 1) { + rte_mempool_put(buf->mp, buf); + } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { + rte_atomic16_set(&buf->refcnt, 1); + rte_mempool_put(buf->mp, buf); + } +} + +void +mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) +{ + mlx5_mprq_buf_free_cb(NULL, buf); +} + +static inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, + const unsigned int strd_n) +{ + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + void *addr; + + MLX5_ASSERT(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep, strd_n); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; +} + +/** + * DPDK callback for RX with Multi-Packet RQ support. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). 
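+ *
+ * Illustrative sketch of the CQE byte_cnt decoding performed below (the
+ * masks and shifts are the MLX5_MPRQ_* definitions used in this file):
+ *   strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+ *              MLX5_MPRQ_STRIDE_NUM_SHIFT;
+ *   len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;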
+ */ +uint16_t +mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + const unsigned int strd_n = 1 << rxq->strd_num_n; + const unsigned int strd_sz = 1 << rxq->strd_sz_n; + const unsigned int strd_shift = + MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; + const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; + const unsigned int wq_mask = (1 << rxq->elts_n) - 1; + volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + unsigned int i = 0; + uint32_t rq_ci = rxq->rq_ci; + uint16_t consumed_strd = rxq->consumed_strd; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + + while (i < pkts_n) { + struct rte_mbuf *pkt; + void *addr; + int ret; + uint32_t len; + uint16_t strd_cnt; + uint16_t strd_idx; + uint32_t offset; + uint32_t byte_cnt; + int32_t hdrm_overlap; + volatile struct mlx5_mini_cqe8 *mcqe = NULL; + uint32_t rss_hash_res = 0; + + if (consumed_strd == strd_n) { + /* Replace WQE only if the buffer is still in use. */ + if (rte_atomic16_read(&buf->refcnt) > 1) { + mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. + */ + if (!rte_mempool_get(rxq->mprq_mp, + (void **)&rep)) + rxq->mprq_repl = rep; + } + /* Advance to the next WQE. */ + consumed_strd = 0; + ++rq_ci; + buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + } + cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); + if (!ret) + break; + byte_cnt = ret; + strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> + MLX5_MPRQ_STRIDE_NUM_SHIFT; + MLX5_ASSERT(strd_cnt); + consumed_strd += strd_cnt; + if (byte_cnt & MLX5_MPRQ_FILLER_MASK) + continue; + if (mcqe == NULL) { + rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); + strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); + } else { + /* mini-CQE for MPRQ doesn't have hash result. */ + strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); + } + MLX5_ASSERT(strd_idx < strd_n); + MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & + wq_mask)); + pkt = rte_pktmbuf_alloc(rxq->mp); + if (unlikely(pkt == NULL)) { + ++rxq->stats.rx_nombuf; + break; + } + len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; + MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); + if (rxq->crc_present) + len -= RTE_ETHER_CRC_LEN; + offset = strd_idx * strd_sz + strd_shift; + addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); + hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz; + /* + * Memcpy packets to the target mbuf if: + * - The size of packet is smaller than mprq_max_memcpy_len. + * - Out of buffer in the Mempool for Multi-Packet RQ. + * - The packet's stride overlaps a headroom and scatter is off. 
+ */ + if (len <= rxq->mprq_max_memcpy_len || + rxq->mprq_repl == NULL || + (hdrm_overlap > 0 && !rxq->strd_scatter_en)) { + if (likely(rte_pktmbuf_tailroom(pkt) >= len)) { + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), + addr, len); + DATA_LEN(pkt) = len; + } else if (rxq->strd_scatter_en) { + struct rte_mbuf *prev = pkt; + uint32_t seg_len = + RTE_MIN(rte_pktmbuf_tailroom(pkt), len); + uint32_t rem_len = len - seg_len; + + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), + addr, seg_len); + DATA_LEN(pkt) = seg_len; + while (rem_len) { + struct rte_mbuf *next = + rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(next == NULL)) { + rte_pktmbuf_free(pkt); + ++rxq->stats.rx_nombuf; + goto out; + } + NEXT(prev) = next; + SET_DATA_OFF(next, 0); + addr = RTE_PTR_ADD(addr, seg_len); + seg_len = RTE_MIN + (rte_pktmbuf_tailroom(next), + rem_len); + rte_memcpy + (rte_pktmbuf_mtod(next, void *), + addr, seg_len); + DATA_LEN(next) = seg_len; + rem_len -= seg_len; + prev = next; + ++NB_SEGS(pkt); + } + } else { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.idropped; + continue; + } + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = strd_cnt * strd_sz; + void *buf_addr; + + /* Increment the refcnt of the whole chunk. */ + rte_atomic16_add_return(&buf->refcnt, 1); + MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= + strd_n + 1); + buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a + * different PMD. + */ + buf_iova = rte_mempool_virt2iova(buf) + + RTE_PTR_DIFF(buf_addr, buf); + shinfo = &buf->shinfos[strd_idx]; + rte_mbuf_ext_refcnt_set(shinfo, 1); + /* + * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when + * attaching the stride to mbuf and more offload flags + * will be added below by calling rxq_cq_to_mbuf(). + * Other fields will be overwritten. + */ + rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, + buf_len, shinfo); + /* Set mbuf head-room. */ + SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM); + MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); + MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >= + len - (hdrm_overlap > 0 ? hdrm_overlap : 0)); + DATA_LEN(pkt) = len; + /* + * Copy the last fragment of a packet (up to headroom + * size bytes) in case there is a stride overlap with + * a next packet's headroom. Allocate a separate mbuf + * to store this fragment and link it. Scatter is on. + */ + if (hdrm_overlap > 0) { + MLX5_ASSERT(rxq->strd_scatter_en); + struct rte_mbuf *seg = + rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(seg == NULL)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.rx_nombuf; + break; + } + SET_DATA_OFF(seg, 0); + rte_memcpy(rte_pktmbuf_mtod(seg, void *), + RTE_PTR_ADD(addr, len - hdrm_overlap), + hdrm_overlap); + DATA_LEN(seg) = hdrm_overlap; + DATA_LEN(pkt) = len - hdrm_overlap; + NEXT(pkt) = seg; + NB_SEGS(pkt) = 2; + } + } + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); + if (cqe->lro_num_seg > 1) { + mlx5_lro_update_hdr(addr, cqe, len); + pkt->ol_flags |= PKT_RX_LRO; + pkt->tso_segsz = len / cqe->lro_num_seg; + } + PKT_LEN(pkt) = len; + PORT(pkt) = rxq->port_id; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment bytes counter. */ + rxq->stats.ibytes += PKT_LEN(pkt); +#endif + /* Return packet. */ + *(pkts++) = pkt; + ++i; + } +out: + /* Update the consumer indexes. 
*/ + rxq->consumed_strd = consumed_strd; + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + if (rq_ci != rxq->rq_ci) { + rxq->rq_ci = rq_ci; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment packets counter. */ + rxq->stats.ipackets += i; +#endif + return i; +} + +/** + * Dummy DPDK callback for TX. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +removed_tx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + rte_mb(); + return 0; +} + +/** + * Dummy DPDK callback for RX. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +removed_rx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + rte_mb(); + return 0; +} + +/* + * Vectorized Rx/Tx routines are not compiled in when required vector + * instructions are not supported on a target architecture. The following null + * stubs are needed for linkage when those are not included outside of this file + * (e.g. mlx5_rxtx_vec_sse.c for x86). + */ + +__rte_weak uint16_t +mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + +__rte_weak int +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) +{ + return -ENOTSUP; +} + +__rte_weak int +mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) +{ + return -ENOTSUP; +} + +/** + * Free the mbufs from the linear array of pointers. + * + * @param pkts + * Pointer to array of packets to be free. + * @param pkts_n + * Number of packets to be freed. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + unsigned int olx __rte_unused) +{ + struct rte_mempool *pool = NULL; + struct rte_mbuf **p_free = NULL; + struct rte_mbuf *mbuf; + unsigned int n_free = 0; + + /* + * The implemented algorithm eliminates + * copying pointers to temporary array + * for rte_mempool_put_bulk() calls. + */ + MLX5_ASSERT(pkts); + MLX5_ASSERT(pkts_n); + for (;;) { + for (;;) { + /* + * Decrement mbuf reference counter, detach + * indirect and external buffers if needed. + */ + mbuf = rte_pktmbuf_prefree_seg(*pkts); + if (likely(mbuf != NULL)) { + MLX5_ASSERT(mbuf == *pkts); + if (likely(n_free != 0)) { + if (unlikely(pool != mbuf->pool)) + /* From different pool. */ + break; + } else { + /* Start new scan array. */ + pool = mbuf->pool; + p_free = pkts; + } + ++n_free; + ++pkts; + --pkts_n; + if (unlikely(pkts_n == 0)) { + mbuf = NULL; + break; + } + } else { + /* + * This happens if mbuf is still referenced. + * We can't put it back to the pool, skip. 
+ */ + ++pkts; + --pkts_n; + if (unlikely(n_free != 0)) + /* There is some array to free.*/ + break; + if (unlikely(pkts_n == 0)) + /* Last mbuf, nothing to free. */ + return; + } + } + for (;;) { + /* + * This loop is implemented to avoid multiple + * inlining of rte_mempool_put_bulk(). + */ + MLX5_ASSERT(pool); + MLX5_ASSERT(p_free); + MLX5_ASSERT(n_free); + /* + * Free the array of pre-freed mbufs + * belonging to the same memory pool. + */ + rte_mempool_put_bulk(pool, (void *)p_free, n_free); + if (unlikely(mbuf != NULL)) { + /* There is the request to start new scan. */ + pool = mbuf->pool; + p_free = pkts++; + n_free = 1; + --pkts_n; + if (likely(pkts_n != 0)) + break; + /* + * This is the last mbuf to be freed. + * Do one more loop iteration to complete. + * This is rare case of the last unique mbuf. + */ + mbuf = NULL; + continue; + } + if (likely(pkts_n == 0)) + return; + n_free = 0; + break; + } + } +} + +/** + * Free the mbuf from the elts ring buffer till new tail. + * + * @param txq + * Pointer to Tx queue structure. + * @param tail + * Index in elts to free up to, becomes new elts tail. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, + uint16_t tail, + unsigned int olx __rte_unused) +{ + uint16_t n_elts = tail - txq->elts_tail; + + MLX5_ASSERT(n_elts); + MLX5_ASSERT(n_elts <= txq->elts_s); + /* + * Implement a loop to support ring buffer wraparound + * with single inlining of mlx5_tx_free_mbuf(). + */ + do { + unsigned int part; + + part = txq->elts_s - (txq->elts_tail & txq->elts_m); + part = RTE_MIN(part, n_elts); + MLX5_ASSERT(part); + MLX5_ASSERT(part <= txq->elts_s); + mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], + part, olx); + txq->elts_tail += part; + n_elts -= part; + } while (n_elts); +} + +/** + * Store the mbuf being sent into elts ring buffer. + * On Tx completion these mbufs will be freed. + * + * @param txq + * Pointer to Tx queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + unsigned int olx __rte_unused) +{ + unsigned int part; + struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; + + MLX5_ASSERT(pkts); + MLX5_ASSERT(pkts_n); + part = txq->elts_s - (txq->elts_head & txq->elts_m); + MLX5_ASSERT(part); + MLX5_ASSERT(part <= txq->elts_s); + /* This code is a good candidate for vectorizing with SIMD. */ + rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), + (void *)pkts, + RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); + txq->elts_head += pkts_n; + if (unlikely(part < pkts_n)) + /* The copy is wrapping around the elts array. */ + rte_memcpy((void *)elts, (void *)(pkts + part), + (pkts_n - part) * sizeof(struct rte_mbuf *)); +} + +/** + * Update completion queue consuming index via doorbell + * and flush the completed data buffers. + * + * @param txq + * Pointer to TX queue structure. + * @param valid CQE pointer + * if not NULL update txq->wqe_pi and flush the buffers + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
+ */ +static __rte_always_inline void +mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, + volatile struct mlx5_cqe *last_cqe, + unsigned int olx __rte_unused) +{ + if (likely(last_cqe != NULL)) { + uint16_t tail; + + txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); + tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; + if (likely(tail != txq->elts_tail)) { + mlx5_tx_free_elts(txq, tail, olx); + MLX5_ASSERT(tail == txq->elts_tail); + } + } +} + +/** + * Manage TX completions. This routine checks the CQ for + * arrived CQEs, deduces the last accomplished WQE in SQ, + * updates SQ producing index and frees all completed mbufs. + * + * @param txq + * Pointer to TX queue structure. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * NOTE: not inlined intentionally, it makes tx_burst + * routine smaller, simple and faster - from experiments. + */ +static void +mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, + unsigned int olx __rte_unused) +{ + unsigned int count = MLX5_TX_COMP_MAX_CQE; + volatile struct mlx5_cqe *last_cqe = NULL; + bool ring_doorbell = false; + int ret; + + static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); + static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); + do { + volatile struct mlx5_cqe *cqe; + + cqe = &txq->cqes[txq->cq_ci & txq->cqe_m]; + ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci); + if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { + if (likely(ret != MLX5_CQE_STATUS_ERR)) { + /* No new CQEs in completion queue. */ + MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN); + break; + } + /* + * Some error occurred, try to restart. + * We have no barrier after WQE related Doorbell + * written, make sure all writes are completed + * here, before we might perform SQ reset. + */ + rte_wmb(); + ret = mlx5_tx_error_cqe_handle + (txq, (volatile struct mlx5_err_cqe *)cqe); + if (unlikely(ret < 0)) { + /* + * Some error occurred on queue error + * handling, we do not advance the index + * here, allowing to retry on next call. + */ + return; + } + /* + * We are going to fetch all entries with + * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. + * The send queue is supposed to be empty. + */ + ring_doorbell = true; + ++txq->cq_ci; + txq->cq_pi = txq->cq_ci; + last_cqe = NULL; + continue; + } + /* Normal transmit completion. */ + MLX5_ASSERT(txq->cq_ci != txq->cq_pi); + MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) == + cqe->wqe_counter); + ring_doorbell = true; + ++txq->cq_ci; + last_cqe = cqe; + /* + * We have to restrict the amount of processed CQEs + * in one tx_burst routine call. The CQ may be large + * and many CQEs may be updated by the NIC in one + * transaction. Buffers freeing is time consuming, + * multiple iterations may introduce significant + * latency. + */ + if (likely(--count == 0)) + break; + } while (true); + if (likely(ring_doorbell)) { + /* Ring doorbell to notify hardware. */ + rte_compiler_barrier(); + *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); + mlx5_tx_comp_flush(txq, last_cqe, olx); + } +} + +/** + * Check if the completion request flag should be set in the last WQE. + * Both pushed mbufs and WQEs are monitored and the completion request + * flag is set if any of thresholds is reached. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
+ */ +static __rte_always_inline void +mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + uint16_t head = txq->elts_head; + unsigned int part; + + part = MLX5_TXOFF_CONFIG(INLINE) ? + 0 : loc->pkts_sent - loc->pkts_copy; + head += part; + if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || + (MLX5_TXOFF_CONFIG(INLINE) && + (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { + volatile struct mlx5_wqe *last = loc->wqe_last; + + MLX5_ASSERT(last); + txq->elts_comp = head; + if (MLX5_TXOFF_CONFIG(INLINE)) + txq->wqe_comp = txq->wqe_ci; + /* Request unconditional completion on last WQE. */ + last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << + MLX5_COMP_MODE_OFFSET); + /* Save elts_head in dedicated free on completion queue. */ +#ifdef RTE_LIBRTE_MLX5_DEBUG + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | + (last->cseg.opcode >> 8) << 16; +#else + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; +#endif + /* A CQE slot must always be available. */ + MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); + } +} + +/** + * DPDK callback to check the status of a tx descriptor. + * + * @param tx_queue + * The tx queue. + * @param[in] offset + * The index of the descriptor in the ring. + * + * @return + * The status of the tx descriptor. + */ +int +mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) +{ + struct mlx5_txq_data *restrict txq = tx_queue; + uint16_t used; + + mlx5_tx_handle_completion(txq, 0); + used = txq->elts_head - txq->elts_tail; + if (offset < used) + return RTE_ETH_TX_DESC_FULL; + return RTE_ETH_TX_DESC_DONE; +} + +/** + * Build the Control Segment with specified opcode: + * - MLX5_OPCODE_SEND + * - MLX5_OPCODE_ENHANCED_MPSW + * - MLX5_OPCODE_TSO + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Control Segment. + * @param ds + * Supposed length of WQE in segments. + * @param opcode + * SQ WQE opcode to put into Control Segment. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc __rte_unused, + struct mlx5_wqe *restrict wqe, + unsigned int ds, + unsigned int opcode, + unsigned int olx __rte_unused) +{ + struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; + + /* For legacy MPW replace the EMPW by TSO with modifier. */ + if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) + opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; + cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); + cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << + MLX5_COMP_MODE_OFFSET); + cs->misc = RTE_BE32(0); +} + +/** + * Build the Ethernet Segment without inlined data. + * Supports Software Parser, Checksums and VLAN + * insertion Tx offload features. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Ethernet Segment. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
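+ *
+ * Note (illustration only): since olx is a compile-time constant for each
+ * specialized burst variant, checks such as
+ *   if (MLX5_TXOFF_CONFIG(VLAN) &&
+ *       loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+ * are intended to be folded by the compiler, dropping unused offload code
+ * from the generated routine.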
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ es->flags = rte_cpu_to_le_32(csum);
+ /*
+ * Calculate and set Software Parser offsets and flags.
+ * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ /* Engage VLAN tag insertion feature if requested. */
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+ /*
+ * We should get here only if the device supports
+ * this feature correctly.
+ */
+ MLX5_ASSERT(txq->vlan_en);
+ es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
+ loc->mbuf->vlan_tci);
+ } else {
+ es->inline_hdr = RTE_BE32(0);
+ }
+}
+
+/**
+ * Build the Ethernet Segment with minimal inlined data
+ * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
+ * used to fill the gap in single WQEBB WQEs.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ * Length of VLAN tag insertion if any.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+ uint8_t *psrc, *pdst;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ es->flags = rte_cpu_to_le_32(csum);
+ /*
+ * Calculate and set Software Parser offsets and flags.
+ * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(rte_v128u32_t)),
+ "invalid Ethernet Segment data size");
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(struct rte_vlan_hdr) +
+ 2 * RTE_ETHER_ADDR_LEN),
+ "invalid Ethernet Segment data size");
+ psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+ es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
+ es->inline_data = *(unaligned_uint16_t *)psrc;
+ psrc += sizeof(uint16_t);
+ pdst = (uint8_t *)(es + 1);
+ if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+ /* Implement VLAN tag insertion as part of inline data. */
+ memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+ pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ /* Insert VLAN ethertype + VLAN tag. */
+ *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
+ ((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+ /* Copy the remaining two bytes from packet data. */
+ MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+ *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
+ } else {
+ /* Fill the gap in the title WQEBB with inline data. */
+ rte_mov16(pdst, psrc);
+ }
+}
+
+/**
+ * Build the Ethernet Segment with entire packet
+ * data inlining. Checks the boundary of WQEBB and
+ * ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ * Length of VLAN tag insertion if any.
+ * @param inlen
+ * Length of data to inline (VLAN included, if any).
+ * @param tso
+ * TSO flag, set mss field from the packet.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Pointer to the next Data Segment (aligned and wrapped around).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int inlen,
+ unsigned int tso,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+ uint8_t *psrc, *pdst;
+ unsigned int part;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ if (tso) {
+ csum <<= 24;
+ csum |= loc->mbuf->tso_segsz;
+ es->flags = rte_cpu_to_be_32(csum);
+ } else {
+ es->flags = rte_cpu_to_le_32(csum);
+ }
+ /*
+ * Calculate and set Software Parser offsets and flags.
+ * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(rte_v128u32_t)),
+ "invalid Ethernet Segment data size");
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(struct rte_vlan_hdr) +
+ 2 * RTE_ETHER_ADDR_LEN),
+ "invalid Ethernet Segment data size");
+ psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+ es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
+ es->inline_data = *(unaligned_uint16_t *)psrc;
+ psrc += sizeof(uint16_t);
+ pdst = (uint8_t *)(es + 1);
+ if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+ /* Implement VLAN tag insertion as part of inline data. */
+ memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+ pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ /* Insert VLAN ethertype + VLAN tag. */
+ *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
+ ((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+ /* Copy the remaining two bytes from packet data.
 */
+		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
+		psrc += sizeof(uint16_t);
+	} else {
+		/* Fill the gap in the title WQEBB with inline data. */
+		rte_mov16(pdst, psrc);
+		psrc += sizeof(rte_v128u32_t);
+	}
+	pdst = (uint8_t *)(es + 2);
+	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
+	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
+	if (!inlen) {
+		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+		return (struct mlx5_wqe_dseg *)pdst;
+	}
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	do {
+		rte_memcpy(pdst, psrc, part);
+		inlen -= part;
+		if (likely(!inlen)) {
+			/*
+			 * If return value is not used by the caller
+			 * the code below will be optimized out.
+			 */
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+				pdst = (uint8_t *)txq->wqes;
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		psrc += part;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Copy data from a chain of mbufs to the specified linear buffer.
+ * If data from some mbuf is copied completely, this mbuf is freed.
+ * The local structure is used to keep the byte stream state.
+ *
+ * @param pdst
+ *   Pointer to the destination linear buffer.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Length of data to be copied.
+ * @param must
+ *   Length of data to be copied ignoring the no-inline hint.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Number of actually copied data bytes. This is always greater than or
+ *   equal to the must parameter and might be less than len if the
+ *   no-inline hint flag is encountered.
+ */
+static __rte_always_inline unsigned int
+mlx5_tx_mseg_memcpy(uint8_t *pdst,
+		    struct mlx5_txq_local *restrict loc,
+		    unsigned int len,
+		    unsigned int must,
+		    unsigned int olx __rte_unused)
+{
+	struct rte_mbuf *mbuf;
+	unsigned int part, dlen, copy = 0;
+	uint8_t *psrc;
+
+	MLX5_ASSERT(len);
+	MLX5_ASSERT(must <= len);
+	do {
+		/* Allow zero length packets, must check first. */
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		if (dlen <= loc->mbuf_off) {
+			/* Exhausted packet, just free. */
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			loc->mbuf_off = 0;
+			MLX5_ASSERT(loc->mbuf_nseg > 1);
+			MLX5_ASSERT(loc->mbuf);
+			--loc->mbuf_nseg;
+			if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
+				unsigned int diff;
+
+				if (copy >= must) {
+					/*
+					 * We already copied the minimal
+					 * requested amount of data.
+					 */
+					return copy;
+				}
+				diff = must - copy;
+				if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
+					/*
+					 * Copy only the minimal required
+					 * part of the data buffer.
+					 */
+					len = diff;
+				}
+			}
+			continue;
+		}
+		dlen -= loc->mbuf_off;
+		psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
+					       loc->mbuf_off);
+		part = RTE_MIN(len, dlen);
+		rte_memcpy(pdst, psrc, part);
+		copy += part;
+		loc->mbuf_off += part;
+		len -= part;
+		if (!len) {
+			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
+				loc->mbuf_off = 0;
+				/* Exhausted packet, just free.
*/ + mbuf = loc->mbuf; + loc->mbuf = mbuf->next; + rte_pktmbuf_free_seg(mbuf); + loc->mbuf_off = 0; + MLX5_ASSERT(loc->mbuf_nseg >= 1); + --loc->mbuf_nseg; + } + return copy; + } + pdst += part; + } while (true); +} + +/** + * Build the Ethernet Segment with inlined data from + * multi-segment packet. Checks the boundary of WQEBB + * and ring buffer wrapping, supports Software Parser, + * Checksums and VLAN insertion Tx offload features. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Ethernet Segment. + * @param vlan + * Length of VLAN tag insertion if any. + * @param inlen + * Length of data to inline (VLAN included, if any). + * @param tso + * TSO flag, set mss field from the packet. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Pointer to the next Data Segment (aligned and + * possible NOT wrapped around - caller should do + * wrapping check on its own). + */ +static __rte_always_inline struct mlx5_wqe_dseg * +mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe *restrict wqe, + unsigned int vlan, + unsigned int inlen, + unsigned int tso, + unsigned int olx) +{ + struct mlx5_wqe_eseg *restrict es = &wqe->eseg; + uint32_t csum; + uint8_t *pdst; + unsigned int part, tlen = 0; + + /* + * Calculate and set check sum flags first, uint32_t field + * in segment may be shared with Software Parser flags. + */ + csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; + if (tso) { + csum <<= 24; + csum |= loc->mbuf->tso_segsz; + es->flags = rte_cpu_to_be_32(csum); + } else { + es->flags = rte_cpu_to_le_32(csum); + } + /* + * Calculate and set Software Parser offsets and flags. + * These flags a set for custom UDP and IP tunnel packets. + */ + es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); + /* Fill metadata field if needed. */ + es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? + loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? + *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; + static_assert(MLX5_ESEG_MIN_INLINE_SIZE == + (sizeof(uint16_t) + + sizeof(rte_v128u32_t)), + "invalid Ethernet Segment data size"); + static_assert(MLX5_ESEG_MIN_INLINE_SIZE == + (sizeof(uint16_t) + + sizeof(struct rte_vlan_hdr) + + 2 * RTE_ETHER_ADDR_LEN), + "invalid Ethernet Segment data size"); + MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); + pdst = (uint8_t *)&es->inline_data; + if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { + /* Implement VLAN tag insertion as part inline data. */ + mlx5_tx_mseg_memcpy(pdst, loc, + 2 * RTE_ETHER_ADDR_LEN, + 2 * RTE_ETHER_ADDR_LEN, olx); + pdst += 2 * RTE_ETHER_ADDR_LEN; + *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 + ((RTE_ETHER_TYPE_VLAN << 16) | + loc->mbuf->vlan_tci); + pdst += sizeof(struct rte_vlan_hdr); + tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); + } + MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); + /* + * The WQEBB space availability is checked by caller. + * Here we should be aware of WQE ring buffer wraparound only. + */ + part = (uint8_t *)txq->wqes_end - pdst; + part = RTE_MIN(part, inlen - tlen); + MLX5_ASSERT(part); + do { + unsigned int copy; + + /* + * Copying may be interrupted inside the routine + * if run into no inline hint flag. + */ + copy = tlen >= txq->inlen_mode ? 
0 : (txq->inlen_mode - tlen); + copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx); + tlen += copy; + if (likely(inlen <= tlen) || copy < part) { + es->inline_hdr_sz = rte_cpu_to_be_16(tlen); + pdst += copy; + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + return (struct mlx5_wqe_dseg *)pdst; + } + pdst = (uint8_t *)txq->wqes; + part = inlen - tlen; + } while (true); +} + +/** + * Build the Data Segment of pointer type. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to WQE to fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) + +{ + MLX5_ASSERT(len); + dseg->bcount = rte_cpu_to_be_32(len); + dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); + dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); +} + +/** + * Build the Data Segment of pointer type or inline + * if data length is less than buffer in minimal + * Data Segment size. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to WQE to fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) + +{ + uintptr_t dst, src; + + MLX5_ASSERT(len); + if (len > MLX5_DSEG_MIN_INLINE_SIZE) { + dseg->bcount = rte_cpu_to_be_32(len); + dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); + dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); + + return; + } + dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); + /* Unrolled implementation of generic rte_memcpy. */ + dst = (uintptr_t)&dseg->inline_data[0]; + src = (uintptr_t)buf; + if (len & 0x08) { +#ifdef RTE_ARCH_STRICT_ALIGN + MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); +#else + *(uint64_t *)dst = *(unaligned_uint64_t *)src; + dst += sizeof(uint64_t); + src += sizeof(uint64_t); +#endif + } + if (len & 0x04) { + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); + } + if (len & 0x02) { + *(uint16_t *)dst = *(unaligned_uint16_t *)src; + dst += sizeof(uint16_t); + src += sizeof(uint16_t); + } + if (len & 0x01) + *(uint8_t *)dst = *(uint8_t *)src; +} + +/** + * Build the Data Segment of inlined data from single + * segment packet, no VLAN insertion. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to WQE to fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. 
+ * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Pointer to the next Data Segment after inlined data. + * Ring buffer wraparound check is needed. We do not + * do it here because it may not be needed for the + * last packet in the eMPW session. + */ +static __rte_always_inline struct mlx5_wqe_dseg * +mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc __rte_unused, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) +{ + unsigned int part; + uint8_t *pdst; + + if (!MLX5_TXOFF_CONFIG(MPW)) { + /* Store the descriptor byte counter for eMPW sessions. */ + dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); + pdst = &dseg->inline_data[0]; + } else { + /* The entire legacy MPW session counter is stored on close. */ + pdst = (uint8_t *)dseg; + } + /* + * The WQEBB space availability is checked by caller. + * Here we should be aware of WQE ring buffer wraparound only. + */ + part = (uint8_t *)txq->wqes_end - pdst; + part = RTE_MIN(part, len); + do { + rte_memcpy(pdst, buf, part); + len -= part; + if (likely(!len)) { + pdst += part; + if (!MLX5_TXOFF_CONFIG(MPW)) + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + /* Note: no final wraparound check here. */ + return (struct mlx5_wqe_dseg *)pdst; + } + pdst = (uint8_t *)txq->wqes; + buf += part; + part = len; + } while (true); +} + +/** + * Build the Data Segment of inlined data from single + * segment packet with VLAN insertion. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to the dseg fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Pointer to the next Data Segment after inlined data. + * Ring buffer wraparound check is needed. + */ +static __rte_always_inline struct mlx5_wqe_dseg * +mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc __rte_unused, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) + +{ + unsigned int part; + uint8_t *pdst; + + MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE); + static_assert(MLX5_DSEG_MIN_INLINE_SIZE == + (2 * RTE_ETHER_ADDR_LEN), + "invalid Data Segment data size"); + if (!MLX5_TXOFF_CONFIG(MPW)) { + /* Store the descriptor byte counter for eMPW sessions. */ + dseg->bcount = rte_cpu_to_be_32 + ((len + sizeof(struct rte_vlan_hdr)) | + MLX5_ETH_WQE_DATA_INLINE); + pdst = &dseg->inline_data[0]; + } else { + /* The entire legacy MPW session counter is stored on close. */ + pdst = (uint8_t *)dseg; + } + memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); + buf += MLX5_DSEG_MIN_INLINE_SIZE; + pdst += MLX5_DSEG_MIN_INLINE_SIZE; + len -= MLX5_DSEG_MIN_INLINE_SIZE; + /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ + MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); + if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) + pdst = (uint8_t *)txq->wqes; + *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | + loc->mbuf->vlan_tci); + pdst += sizeof(struct rte_vlan_hdr); + /* + * The WQEBB space availability is checked by caller. + * Here we should be aware of WQE ring buffer wraparound only. 
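+	 * The copy loop below splits the data at txq->wqes_end and
+	 * continues from the ring base (txq->wqes) if the end of the
+	 * WQE buffer is reached.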
+ */ + part = (uint8_t *)txq->wqes_end - pdst; + part = RTE_MIN(part, len); + do { + rte_memcpy(pdst, buf, part); + len -= part; + if (likely(!len)) { + pdst += part; + if (!MLX5_TXOFF_CONFIG(MPW)) + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + /* Note: no final wraparound check here. */ + return (struct mlx5_wqe_dseg *)pdst; + } + pdst = (uint8_t *)txq->wqes; + buf += part; + part = len; + } while (true); +} + +/** + * Build the Ethernet Segment with optionally inlined data with + * VLAN insertion and following Data Segments (if any) from + * multi-segment packet. Used by ordinary send and TSO. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Ethernet/Data Segments. + * @param vlan + * Length of VLAN header to insert, 0 means no VLAN insertion. + * @param inlen + * Data length to inline. For TSO this parameter specifies + * exact value, for ordinary send routine can be aligned by + * caller to provide better WQE space saving and data buffer + * start address alignment. This length includes VLAN header + * being inserted. + * @param tso + * Zero means ordinary send, inlined data can be extended, + * otherwise this is TSO, inlined data length is fixed. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Actual size of built WQE in segments. + */ +static __rte_always_inline unsigned int +mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe *restrict wqe, + unsigned int vlan, + unsigned int inlen, + unsigned int tso, + unsigned int olx __rte_unused) +{ + struct mlx5_wqe_dseg *restrict dseg; + unsigned int ds; + + MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); + loc->mbuf_nseg = NB_SEGS(loc->mbuf); + loc->mbuf_off = 0; + + dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); + if (!loc->mbuf_nseg) + goto dseg_done; + /* + * There are still some mbuf remaining, not inlined. + * The first mbuf may be partially inlined and we + * must process the possible non-zero data offset. + */ + if (loc->mbuf_off) { + unsigned int dlen; + uint8_t *dptr; + + /* + * Exhausted packets must be dropped before. + * Non-zero offset means there are some data + * remained in the packet. + */ + MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); + MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf)); + dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, + loc->mbuf_off); + dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; + /* + * Build the pointer/minimal data Data Segment. + * Do ring buffer wrapping check in advance. + */ + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); + /* Store the mbuf to be freed on completion. */ + MLX5_ASSERT(loc->elts_free); + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + ++dseg; + if (--loc->mbuf_nseg == 0) + goto dseg_done; + loc->mbuf = loc->mbuf->next; + loc->mbuf_off = 0; + } + do { + if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { + struct rte_mbuf *mbuf; + + /* Zero length segment found, just skip. 
*/ + mbuf = loc->mbuf; + loc->mbuf = loc->mbuf->next; + rte_pktmbuf_free_seg(mbuf); + if (--loc->mbuf_nseg == 0) + break; + } else { + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + mlx5_tx_dseg_iptr + (txq, loc, dseg, + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + rte_pktmbuf_data_len(loc->mbuf), olx); + MLX5_ASSERT(loc->elts_free); + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + ++dseg; + if (--loc->mbuf_nseg == 0) + break; + loc->mbuf = loc->mbuf->next; + } + } while (true); + +dseg_done: + /* Calculate actual segments used from the dseg pointer. */ + if ((uintptr_t)wqe < (uintptr_t)dseg) + ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; + else + ds = (((uintptr_t)dseg - (uintptr_t)wqe) + + txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; + return ds; +} + +/** + * Tx one packet function for multi-segment TSO. Supports all + * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, + * sends one packet per WQE. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * Local context variables partially updated. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + struct mlx5_wqe *restrict wqe; + unsigned int ds, dlen, inlen, ntcp, vlan = 0; + + /* + * Calculate data length to be inlined to estimate + * the required space in WQE ring buffer. + */ + dlen = rte_pktmbuf_pkt_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + vlan = sizeof(struct rte_vlan_hdr); + inlen = loc->mbuf->l2_len + vlan + + loc->mbuf->l3_len + loc->mbuf->l4_len; + if (unlikely((!inlen || !loc->mbuf->tso_segsz))) + return MLX5_TXCMP_CODE_ERROR; + if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) + inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; + /* Packet must contain all TSO headers. */ + if (unlikely(inlen > MLX5_MAX_TSO_HEADER || + inlen <= MLX5_ESEG_MIN_INLINE_SIZE || + inlen > (dlen + vlan))) + return MLX5_TXCMP_CODE_ERROR; + MLX5_ASSERT(inlen >= txq->inlen_mode); + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Data Segments of pointer/min inline type + */ + ds = NB_SEGS(loc->mbuf) + 2 + (inlen - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (unlikely(loc->wqe_free < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + /* Check for maximal WQE size. */ + if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_ERROR; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes/packets counters. */ + ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / + loc->mbuf->tso_segsz; + /* + * One will be added for mbuf itself + * at the end of the mlx5_tx_burst from + * loc->pkts_sent field. 
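+	 * (Example with assumed values: dlen = 4554, inlen = 54,
+	 * vlan = 0 and tso_segsz = 1500 give ntcp = 3 TSO segments;
+	 * after the decrement two are counted here and one later in
+	 * the burst loop, and obytes grows by 4554 + 2 * 54 = 4662
+	 * bytes, i.e. 3 * (1500 + 54).)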
+ */ + --ntcp; + txq->stats.opackets += ntcp; + txq->stats.obytes += dlen + vlan + ntcp * inlen; +#endif + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); + ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); + wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + return MLX5_TXCMP_CODE_MULTI; +} + +/** + * Tx one packet function for multi-segment SEND. Supports all + * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, + * sends one packet per WQE, without any data inlining in + * Ethernet Segment. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * Local context variables partially updated. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe *restrict wqe; + unsigned int ds, nseg; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + /* + * No inline at all, it means the CPU cycles saving + * is prioritized at configuration, we should not + * copy any packet data to WQE. + */ + nseg = NB_SEGS(loc->mbuf); + ds = 2 + nseg; + if (unlikely(loc->wqe_free < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + /* Check for maximal WQE size. */ + if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_ERROR; + /* + * Some Tx offloads may cause an error if + * packet is not long enough, check against + * assumed minimal length. + */ + if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) + return MLX5_TXCMP_CODE_ERROR; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + txq->stats.obytes += sizeof(struct rte_vlan_hdr); +#endif + /* + * SEND WQE, one WQEBB: + * - Control Segment, SEND opcode + * - Ethernet Segment, optional VLAN, no inline + * - Data Segments, pointer only type + */ + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_none(txq, loc, wqe, olx); + dseg = &wqe->dseg[0]; + do { + if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { + struct rte_mbuf *mbuf; + + /* + * Zero length segment found, have to + * correct total size of WQE in segments. + * It is supposed to be rare occasion, so + * in normal case (no zero length segments) + * we avoid extra writing to the Control + * Segment. 
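+			 * Subtracting RTE_BE32(1) below decrements the DS
+			 * count kept in the low bits of the big-endian
+			 * sq_ds field without converting it to host order.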
+ */ + --ds; + wqe->cseg.sq_ds -= RTE_BE32(1); + mbuf = loc->mbuf; + loc->mbuf = mbuf->next; + rte_pktmbuf_free_seg(mbuf); + if (--nseg == 0) + break; + } else { + mlx5_tx_dseg_ptr + (txq, loc, dseg, + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + rte_pktmbuf_data_len(loc->mbuf), olx); + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + if (--nseg == 0) + break; + ++dseg; + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + loc->mbuf = loc->mbuf->next; + } + } while (true); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + return MLX5_TXCMP_CODE_MULTI; +} + +/** + * Tx one packet function for multi-segment SEND. Supports all + * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, + * sends one packet per WQE, with data inlining in + * Ethernet Segment and minimal Data Segments. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * Local context variables partially updated. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + struct mlx5_wqe *restrict wqe; + unsigned int ds, inlen, dlen, vlan = 0; + + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + /* + * First calculate data length to be inlined + * to estimate the required space for WQE. + */ + dlen = rte_pktmbuf_pkt_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + vlan = sizeof(struct rte_vlan_hdr); + inlen = dlen + vlan; + /* Check against minimal length. */ + if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) + return MLX5_TXCMP_CODE_ERROR; + MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); + if (inlen > txq->inlen_send || + loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { + struct rte_mbuf *mbuf; + unsigned int nxlen; + uintptr_t start; + + /* + * Packet length exceeds the allowed inline + * data length, check whether the minimal + * inlining is required. + */ + if (txq->inlen_mode) { + MLX5_ASSERT(txq->inlen_mode >= + MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send); + inlen = txq->inlen_mode; + } else { + if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE || + !vlan || txq->vlan_en) { + /* + * VLAN insertion will be done inside by HW. + * It is not utmost effective - VLAN flag is + * checked twice, but we should proceed the + * inlining length correctly and take into + * account the VLAN header being inserted. + */ + return mlx5_tx_packet_multi_send + (txq, loc, olx); + } + inlen = MLX5_ESEG_MIN_INLINE_SIZE; + } + /* + * Now we know the minimal amount of data is requested + * to inline. Check whether we should inline the buffers + * from the chain beginning to eliminate some mbufs. + */ + mbuf = loc->mbuf; + nxlen = rte_pktmbuf_data_len(mbuf); + if (unlikely(nxlen <= txq->inlen_send)) { + /* We can inline first mbuf at least. */ + if (nxlen < inlen) { + unsigned int smlen; + + /* Scan mbufs till inlen filled. 
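+				 * Segment lengths are accumulated until the
+				 * requested inline amount is covered, so the
+				 * leading mbufs can be inlined entirely and
+				 * freed during the copy.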
*/ + do { + smlen = nxlen; + mbuf = NEXT(mbuf); + MLX5_ASSERT(mbuf); + nxlen = rte_pktmbuf_data_len(mbuf); + nxlen += smlen; + } while (unlikely(nxlen < inlen)); + if (unlikely(nxlen > txq->inlen_send)) { + /* We cannot inline entire mbuf. */ + smlen = inlen - smlen; + start = rte_pktmbuf_mtod_offset + (mbuf, uintptr_t, smlen); + goto do_align; + } + } + do { + inlen = nxlen; + mbuf = NEXT(mbuf); + /* There should be not end of packet. */ + MLX5_ASSERT(mbuf); + nxlen = inlen + rte_pktmbuf_data_len(mbuf); + } while (unlikely(nxlen < txq->inlen_send)); + } + start = rte_pktmbuf_mtod(mbuf, uintptr_t); + /* + * Check whether we can do inline to align start + * address of data buffer to cacheline. + */ +do_align: + start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); + if (unlikely(start)) { + start += inlen; + if (start <= txq->inlen_send) + inlen = start; + } + } + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Data Segments of pointer/min inline type + * + * Estimate the number of Data Segments conservatively, + * supposing no any mbufs is being freed during inlining. + */ + MLX5_ASSERT(inlen <= txq->inlen_send); + ds = NB_SEGS(loc->mbuf) + 2 + (inlen - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (unlikely(loc->wqe_free < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + /* Check for maximal WQE size. */ + if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_ERROR; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes/packets counters. */ + txq->stats.obytes += dlen + vlan; +#endif + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); + ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); + wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + return MLX5_TXCMP_CODE_MULTI; +} + +/** + * Tx burst function for multi-segment packets. Supports all + * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, + * sends one packet per WQE. Function stops sending if it + * encounters the single-segment packet. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. + * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. + * Local context variables updated. 
+ */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + enum mlx5_txcmp_code ret; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + /* + * Estimate the number of free elts quickly but + * conservatively. Some segment may be fully inlined + * and freed, ignore this here - precise estimation + * is costly. + */ + if (loc->elts_free < NB_SEGS(loc->mbuf)) + return MLX5_TXCMP_CODE_EXIT; + if (MLX5_TXOFF_CONFIG(TSO) && + unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { + /* Proceed with multi-segment TSO. */ + ret = mlx5_tx_packet_multi_tso(txq, loc, olx); + } else if (MLX5_TXOFF_CONFIG(INLINE)) { + /* Proceed with multi-segment SEND with inlining. */ + ret = mlx5_tx_packet_multi_inline(txq, loc, olx); + } else { + /* Proceed with multi-segment SEND w/o inlining. */ + ret = mlx5_tx_packet_multi_send(txq, loc, olx); + } + if (ret == MLX5_TXCMP_CODE_EXIT) + return MLX5_TXCMP_CODE_EXIT; + if (ret == MLX5_TXCMP_CODE_ERROR) + return MLX5_TXCMP_CODE_ERROR; + /* WQE is built, go to the next packet. */ + ++loc->pkts_sent; + --pkts_n; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + if (pkts_n > 1) + rte_prefetch0(*pkts); + if (likely(NB_SEGS(loc->mbuf) > 1)) + continue; + /* Here ends the series of multi-segment packets. */ + if (MLX5_TXOFF_CONFIG(TSO) && + unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) + return MLX5_TXCMP_CODE_TSO; + return MLX5_TXCMP_CODE_SINGLE; + } + MLX5_ASSERT(false); +} + +/** + * Tx burst function for single-segment packets with TSO. + * Supports all types of Tx offloads, except multi-packets. + * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. + * Function stops sending if it encounters the multi-segment + * packet or packet without TSO requested. + * + * The routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head if inline + * offloads is requested due to possible early freeing + * of the inlined mbufs (can not store pkts array in elts + * as a batch). + * + * @param txq + * Pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. + * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. + * Local context variables updated. 
+ */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe *restrict wqe; + unsigned int ds, dlen, hlen, ntcp, vlan = 0; + uint8_t *dptr; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + dlen = rte_pktmbuf_data_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { + vlan = sizeof(struct rte_vlan_hdr); + } + /* + * First calculate the WQE size to check + * whether we have enough space in ring buffer. + */ + hlen = loc->mbuf->l2_len + vlan + + loc->mbuf->l3_len + loc->mbuf->l4_len; + if (unlikely((!hlen || !loc->mbuf->tso_segsz))) + return MLX5_TXCMP_CODE_ERROR; + if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) + hlen += loc->mbuf->outer_l2_len + + loc->mbuf->outer_l3_len; + /* Segment must contain all TSO headers. */ + if (unlikely(hlen > MLX5_MAX_TSO_HEADER || + hlen <= MLX5_ESEG_MIN_INLINE_SIZE || + hlen > (dlen + vlan))) + return MLX5_TXCMP_CODE_ERROR; + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Finishing Data Segment of pointer type + */ + ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (loc->wqe_free < ((ds + 3) / 4)) + return MLX5_TXCMP_CODE_EXIT; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes/packets counters. */ + ntcp = (dlen + vlan - hlen + + loc->mbuf->tso_segsz - 1) / + loc->mbuf->tso_segsz; + /* + * One will be added for mbuf itself at the end + * of the mlx5_tx_burst from loc->pkts_sent field. + */ + --ntcp; + txq->stats.opackets += ntcp; + txq->stats.obytes += dlen + vlan + ntcp * hlen; +#endif + /* + * Build the TSO WQE: + * - Control Segment + * - Ethernet Segment with hlen bytes inlined + * - Data Segment of pointer type + */ + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, ds, + MLX5_OPCODE_TSO, olx); + dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); + dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; + dlen -= hlen - vlan; + mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); + /* + * WQE is built, update the loop parameters + * and go to the next packet. + */ + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + if (MLX5_TXOFF_CONFIG(INLINE)) + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + ++loc->pkts_sent; + --pkts_n; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + if (pkts_n > 1) + rte_prefetch0(*pkts); + if (MLX5_TXOFF_CONFIG(MULTI) && + unlikely(NB_SEGS(loc->mbuf) > 1)) + return MLX5_TXCMP_CODE_MULTI; + if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) + return MLX5_TXCMP_CODE_SINGLE; + /* Continue with the next TSO packet. */ + } + MLX5_ASSERT(false); +} + +/** + * Analyze the packet and select the best method to send. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
+ * @param newp
+ *   The predefined flag whether to do a complete check for
+ *   multi-segment packets and TSO.
+ *
+ * @return
+ *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
+ *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
+ *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx,
+		     bool newp)
+{
+	/* Check for multi-segment packet. */
+	if (newp &&
+	    MLX5_TXOFF_CONFIG(MULTI) &&
+	    unlikely(NB_SEGS(loc->mbuf) > 1))
+		return MLX5_TXCMP_CODE_MULTI;
+	/* Check for TSO packet. */
+	if (newp &&
+	    MLX5_TXOFF_CONFIG(TSO) &&
+	    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+		return MLX5_TXCMP_CODE_TSO;
+	/* Check if eMPW is enabled at all. */
+	if (!MLX5_TXOFF_CONFIG(EMPW))
+		return MLX5_TXCMP_CODE_SINGLE;
+	/* Check if eMPW can be engaged. */
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
+	    (!MLX5_TXOFF_CONFIG(INLINE) ||
+	     unlikely((rte_pktmbuf_data_len(loc->mbuf) +
+		       sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
+		/*
+		 * eMPW does not support VLAN insertion offload,
+		 * we have to inline the entire packet but
+		 * packet is too long for inlining.
+		 */
+		return MLX5_TXCMP_CODE_SINGLE;
+	}
+	return MLX5_TXCMP_CODE_EMPW;
+}
+
+/**
+ * Check whether the next packet attributes match the eMPW batch ones.
+ * In addition, for legacy MPW the packet length is checked as well.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param es
+ *   Pointer to Ethernet Segment of eMPW batch.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dlen
+ *   Length of previous packet in MPW descriptor.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *  true - packet matches the eMPW batch attributes.
+ *  false - no match, eMPW should be restarted.
+ */
+static __rte_always_inline bool
+mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
+		   struct mlx5_wqe_eseg *restrict es,
+		   struct mlx5_txq_local *restrict loc,
+		   uint32_t dlen,
+		   unsigned int olx)
+{
+	uint8_t swp_flags = 0;
+
+	/* Compare the checksum flags, if any. */
+	if (MLX5_TXOFF_CONFIG(CSUM) &&
+	    txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
+		return false;
+	/* Compare the Software Parser offsets and flags. */
+	if (MLX5_TXOFF_CONFIG(SWP) &&
+	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
+	     es->swp_flags != swp_flags))
+		return false;
+	/* Compare metadata field if needed. */
+	if (MLX5_TXOFF_CONFIG(METADATA) &&
+	    es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+			     *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
+		return false;
+	/* Legacy MPW can send packets with the same length only. */
+	if (MLX5_TXOFF_CONFIG(MPW) &&
+	    dlen != rte_pktmbuf_data_len(loc->mbuf))
+		return false;
+	/* There must be no VLAN packets in eMPW loop. */
+	if (MLX5_TXOFF_CONFIG(VLAN))
+		MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+	return true;
+}
+
+/*
+ * Update send loop variables and WQE for eMPW loop
+ * without data inlining. Number of Data Segments is
+ * equal to the number of sent packets.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param ds
+ *   Number of packets/Data Segments in the eMPW session.
+ * @param slen
+ *   Accumulated statistics, number of bytes sent.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int ds,
+		   unsigned int slen,
+		   unsigned int olx __rte_unused)
+{
+	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	loc->elts_free -= ds;
+	loc->pkts_sent += ds;
+	ds += 2;
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+}
+
+/*
+ * Update send loop variables and WQE for eMPW loop
+ * with data inlining. Takes the total size of the
+ * descriptors and data pushed to the WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Total size of descriptor/data in bytes.
+ * @param slen
+ *   Accumulated statistics, number of data bytes sent.
+ * @param wqem
+ *   The base WQE for the eMPW/MPW descriptor.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int len,
+		   unsigned int slen,
+		   struct mlx5_wqe *restrict wqem,
+		   unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
+
+	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
+		/*
+		 * If the legacy MPW session contains the inline packets
+		 * we should set the only inline data segment length
+		 * and align the total length to the segment size.
+		 */
+		MLX5_ASSERT(len > sizeof(dseg->bcount));
+		dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
+						MLX5_ETH_WQE_DATA_INLINE);
+		len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
+	} else {
+		/*
+		 * The session is not legacy MPW or contains the
+		 * data buffer pointer segments.
+		 */
+		MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
+		len = len / MLX5_WSEG_SIZE + 2;
+	}
+	wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
+	txq->wqe_ci += (len + 3) / 4;
+	loc->wqe_free -= (len + 3) / 4;
+	loc->wqe_last = wqem;
+}
+
+/**
+ * The set of Tx burst functions for single-segment packets
+ * without TSO and with Multi-Packet Writing feature support.
+ * Supports all types of Tx offloads, except multi-packets
+ * and TSO.
+ *
+ * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
+ * as many packets per WQE as it can. If eMPW is not configured
+ * or the packet can not be sent with eMPW (VLAN insertion) the
+ * ordinary SEND opcode is used and only one packet is placed
+ * in the WQE.
+ *
+ * The functions stop sending if they encounter a multi-segment
+ * packet or a packet with TSO requested.
+ *
+ * The routines are responsible for storing the processed mbuf
+ * into the elts ring buffer and updating elts_head if the inlining
+ * offload is requested.
Otherwise the copying mbufs to elts + * can be postponed and completed at the end of burst routine. + * + * @param txq + * Pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. + * MLX5_TXCMP_CODE_TSO - TSO packet encountered. + * MLX5_TXCMP_CODE_SINGLE - used inside functions set. + * MLX5_TXCMP_CODE_EMPW - used inside functions set. + * + * Local context variables updated. + * + * + * The routine sends packets with MLX5_OPCODE_EMPW + * without inlining, this is dedicated optimized branch. + * No VLAN insertion is supported. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + /* + * Subroutine is the part of mlx5_tx_burst_single() + * and sends single-segment packet with eMPW opcode + * without data inlining. + */ + MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe_eseg *restrict eseg; + enum mlx5_txcmp_code ret; + unsigned int part, loop; + unsigned int slen = 0; + +next_empw: + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? + MLX5_MPW_MAX_PACKETS : + MLX5_EMPW_MAX_PACKETS); + if (unlikely(loc->elts_free < part)) { + /* We have no enough elts to save all mbufs. */ + if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) + return MLX5_TXCMP_CODE_EXIT; + /* But we still able to send at least minimal eMPW. */ + part = loc->elts_free; + } + /* Check whether we have enough WQEs */ + if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { + if (unlikely(loc->wqe_free < + ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + part = (loc->wqe_free * 4) - 2; + } + if (likely(part > 1)) + rte_prefetch0(*pkts); + loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); + /* + * Build eMPW title WQEBB: + * - Control Segment, eMPW opcode + * - Ethernet Segment, no inline + */ + mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, + MLX5_OPCODE_ENHANCED_MPSW, olx); + mlx5_tx_eseg_none(txq, loc, loc->wqe_last, + olx & ~MLX5_TXOFF_CONFIG_VLAN); + eseg = &loc->wqe_last->eseg; + dseg = &loc->wqe_last->dseg[0]; + loop = part; + /* Store the packet length for legacy MPW. */ + if (MLX5_TXOFF_CONFIG(MPW)) + eseg->mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); + for (;;) { + uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. 
*/ + slen += dlen; +#endif + mlx5_tx_dseg_ptr + (txq, loc, dseg, + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + dlen, olx); + if (unlikely(--loop == 0)) + break; + loc->mbuf = *pkts++; + if (likely(loop > 1)) + rte_prefetch0(*pkts); + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + /* + * Unroll the completion code to avoid + * returning variable value - it results in + * unoptimized sequent checking in caller. + */ + if (ret == MLX5_TXCMP_CODE_MULTI) { + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_MULTI; + } + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (ret == MLX5_TXCMP_CODE_TSO) { + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_TSO; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) { + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_SINGLE; + } + if (ret != MLX5_TXCMP_CODE_EMPW) { + MLX5_ASSERT(false); + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + return MLX5_TXCMP_CODE_ERROR; + } + /* + * Check whether packet parameters coincide + * within assumed eMPW batch: + * - check sum settings + * - metadata value + * - software parser settings + * - packets length (legacy MPW only) + */ + if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { + MLX5_ASSERT(loop); + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + pkts_n -= part; + goto next_empw; + } + /* Packet attributes match, continue the same eMPW. */ + ++dseg; + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + } + /* eMPW is built successfully, update loop parameters. */ + MLX5_ASSERT(!loop); + MLX5_ASSERT(pkts_n >= part); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + txq->stats.obytes += slen; +#endif + loc->elts_free -= part; + loc->pkts_sent += part; + txq->wqe_ci += (2 + part + 3) / 4; + loc->wqe_free -= (2 + part + 3) / 4; + pkts_n -= part; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) + return ret; + /* Continue sending eMPW batches. */ + } + MLX5_ASSERT(false); +} + +/** + * The routine sends packets with MLX5_OPCODE_EMPW + * with inlining, optionally supports VLAN insertion. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + /* + * Subroutine is the part of mlx5_tx_burst_single() + * and sends single-segment packet with eMPW opcode + * with data inlining. 
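+	 * Each eMPW session is opened with a title WQEBB carrying the
+	 * Control and Ethernet Segments, followed by inline and/or
+	 * pointer Data Segments; the DS count in the Control Segment
+	 * is fixed up by mlx5_tx_idone_empw() when the session closes.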
+ */ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe *restrict wqem; + enum mlx5_txcmp_code ret; + unsigned int room, part, nlim; + unsigned int slen = 0; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + /* + * Limits the amount of packets in one WQE + * to improve CQE latency generation. + */ + nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? + MLX5_MPW_INLINE_MAX_PACKETS : + MLX5_EMPW_MAX_PACKETS); + /* Check whether we have minimal amount WQEs */ + if (unlikely(loc->wqe_free < + ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + if (likely(pkts_n > 1)) + rte_prefetch0(*pkts); + wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m); + /* + * Build eMPW title WQEBB: + * - Control Segment, eMPW opcode, zero DS + * - Ethernet Segment, no inline + */ + mlx5_tx_cseg_init(txq, loc, wqem, 0, + MLX5_OPCODE_ENHANCED_MPSW, olx); + mlx5_tx_eseg_none(txq, loc, wqem, + olx & ~MLX5_TXOFF_CONFIG_VLAN); + dseg = &wqem->dseg[0]; + /* Store the packet length for legacy MPW. */ + if (MLX5_TXOFF_CONFIG(MPW)) + wqem->eseg.mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); + room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, + loc->wqe_free) * MLX5_WQE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE; + /* Limit the room for legacy MPW sessions for performance. */ + if (MLX5_TXOFF_CONFIG(MPW)) + room = RTE_MIN(room, + RTE_MAX(txq->inlen_empw + + sizeof(dseg->bcount) + + (MLX5_TXOFF_CONFIG(VLAN) ? + sizeof(struct rte_vlan_hdr) : 0), + MLX5_MPW_INLINE_MAX_PACKETS * + MLX5_WQE_DSEG_SIZE)); + /* Build WQE till we have space, packets and resources. */ + part = room; + for (;;) { + uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); + uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); + unsigned int tlen; + + MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); + MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0); + MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end); + /* + * Some Tx offloads may cause an error if + * packet is not long enough, check against + * assumed minimal length. + */ + if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { + part -= room; + if (unlikely(!part)) + return MLX5_TXCMP_CODE_ERROR; + /* + * We have some successfully built + * packet Data Segments to send. + */ + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + return MLX5_TXCMP_CODE_ERROR; + } + /* Inline or not inline - that's the Question. */ + if (dlen > txq->inlen_empw || + loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) + goto pointer_empw; + if (MLX5_TXOFF_CONFIG(MPW)) { + if (dlen > txq->inlen_send) + goto pointer_empw; + tlen = dlen; + if (part == room) { + /* Open new inline MPW session. */ + tlen += sizeof(dseg->bcount); + dseg->bcount = RTE_BE32(0); + dseg = RTE_PTR_ADD + (dseg, sizeof(dseg->bcount)); + } else { + /* + * No pointer and inline descriptor + * intermix for legacy MPW sessions. + */ + if (wqem->dseg[0].bcount) + break; + } + } else { + tlen = sizeof(dseg->bcount) + dlen; + } + /* Inline entire packet, optional VLAN insertion. */ + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { + /* + * The packet length must be checked in + * mlx5_tx_able_to_empw() and packet + * fits into inline length guaranteed. 
+ */ + MLX5_ASSERT((dlen + + sizeof(struct rte_vlan_hdr)) <= + txq->inlen_empw); + tlen += sizeof(struct rte_vlan_hdr); + if (room < tlen) + break; + dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, + dptr, dlen, olx); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + slen += sizeof(struct rte_vlan_hdr); +#endif + } else { + if (room < tlen) + break; + dseg = mlx5_tx_dseg_empw(txq, loc, dseg, + dptr, dlen, olx); + } + if (!MLX5_TXOFF_CONFIG(MPW)) + tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); + MLX5_ASSERT(room >= tlen); + room -= tlen; + /* + * Packet data are completely inlined, + * free the packet immediately. + */ + rte_pktmbuf_free_seg(loc->mbuf); + goto next_mbuf; +pointer_empw: + /* + * No pointer and inline descriptor + * intermix for legacy MPW sessions. + */ + if (MLX5_TXOFF_CONFIG(MPW) && + part != room && + wqem->dseg[0].bcount == RTE_BE32(0)) + break; + /* + * Not inlinable VLAN packets are + * proceeded outside of this routine. + */ + MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); + if (MLX5_TXOFF_CONFIG(VLAN)) + MLX5_ASSERT(!(loc->mbuf->ol_flags & + PKT_TX_VLAN_PKT)); + mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); + /* We have to store mbuf in elts.*/ + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + room -= MLX5_WQE_DSEG_SIZE; + /* Ring buffer wraparound is checked at the loop end.*/ + ++dseg; +next_mbuf: +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + slen += dlen; +#endif + loc->pkts_sent++; + loc->elts_free--; + pkts_n--; + if (unlikely(!pkts_n || !loc->elts_free)) { + /* + * We have no resources/packets to + * continue build descriptors. + */ + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + return MLX5_TXCMP_CODE_EXIT; + } + loc->mbuf = *pkts++; + if (likely(pkts_n > 1)) + rte_prefetch0(*pkts); + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + /* + * Unroll the completion code to avoid + * returning variable value - it results in + * unoptimized sequent checking in caller. + */ + if (ret == MLX5_TXCMP_CODE_MULTI) { + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_MULTI; + } + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (ret == MLX5_TXCMP_CODE_TSO) { + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_TSO; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) { + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_SINGLE; + } + if (ret != MLX5_TXCMP_CODE_EMPW) { + MLX5_ASSERT(false); + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + return MLX5_TXCMP_CODE_ERROR; + } + /* Check if we have minimal room left. */ + nlim--; + if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) + break; + /* + * Check whether packet parameters coincide + * within assumed eMPW batch: + * - check sum settings + * - metadata value + * - software parser settings + * - packets length (legacy MPW only) + */ + if (!mlx5_tx_match_empw(txq, &wqem->eseg, + loc, dlen, olx)) + break; + /* Packet attributes match, continue the same eMPW. */ + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + } + /* + * We get here to close an existing eMPW + * session and start the new one. 
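
The inline-or-pointer decision and the room accounting in the loop above can be
condensed as follows. This is an illustrative sketch only: the 16-byte segment
size, the 4-byte byte-count header and the padding of inline data to 16-byte
units are assumed from the surrounding code, and the legacy MPW session rules
and VLAN insertion are left out.

#include <stdint.h>
#include <stdio.h>

#define WSEG_SIZE       16u     /* assumed MLX5_WSEG_SIZE */
#define DSEG_SIZE       16u     /* assumed MLX5_WQE_DSEG_SIZE */
#define BCOUNT_SIZE     4u      /* assumed sizeof(dseg->bcount) */
#define ALIGN_UP(v, a)  (((v) + (a) - 1) / (a) * (a))

/* Return how many bytes of WQE "room" one packet consumes. */
static unsigned int
empw_room_cost(uint32_t dlen, uint32_t inlen_empw, int noinline_hint)
{
        if (noinline_hint || dlen > inlen_empw)
                return DSEG_SIZE;       /* pointer Data Segment */
        /* Inline descriptor: byte count + data, padded to WSEG units. */
        return ALIGN_UP(BCOUNT_SIZE + dlen, WSEG_SIZE);
}

int
main(void)
{
        printf("64B packet, inlined   : %u bytes of room\n",
               empw_room_cost(64, 256, 0));
        printf("64B packet, no-inline : %u bytes of room\n",
               empw_room_cost(64, 256, 1));
        printf("1500B packet          : %u bytes of room\n",
               empw_room_cost(1500, 256, 0));
        return 0;
}
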
+ */ + MLX5_ASSERT(pkts_n); + part -= room; + if (unlikely(!part)) + return MLX5_TXCMP_CODE_EXIT; + mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + /* Continue the loop with new eMPW session. */ + } + MLX5_ASSERT(false); +} + +/** + * The routine sends packets with ordinary MLX5_OPCODE_SEND. + * Data inlining and VLAN insertion are supported. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + /* + * Subroutine is the part of mlx5_tx_burst_single() + * and sends single-segment packet with SEND opcode. + */ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe *restrict wqe; + enum mlx5_txcmp_code ret; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (MLX5_TXOFF_CONFIG(INLINE)) { + unsigned int inlen, vlan = 0; + + inlen = rte_pktmbuf_data_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { + vlan = sizeof(struct rte_vlan_hdr); + inlen += vlan; + static_assert((sizeof(struct rte_vlan_hdr) + + sizeof(struct rte_ether_hdr)) == + MLX5_ESEG_MIN_INLINE_SIZE, + "invalid min inline data size"); + } + /* + * If inlining is enabled at configuration time + * the limit must be not less than minimal size. + * Otherwise we would do extra check for data + * size to avoid crashes due to length overflow. + */ + MLX5_ASSERT(txq->inlen_send >= + MLX5_ESEG_MIN_INLINE_SIZE); + if (inlen <= txq->inlen_send) { + unsigned int seg_n, wqe_n; + + rte_prefetch0(rte_pktmbuf_mtod + (loc->mbuf, uint8_t *)); + /* Check against minimal length. */ + if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) + return MLX5_TXCMP_CODE_ERROR; + if (loc->mbuf->ol_flags & + PKT_TX_DYNF_NOINLINE) { + /* + * The hint flag not to inline packet + * data is set. Check whether we can + * follow the hint. + */ + if ((!MLX5_TXOFF_CONFIG(EMPW) && + txq->inlen_mode) || + (MLX5_TXOFF_CONFIG(MPW) && + txq->inlen_mode)) { + /* + * The hardware requires the + * minimal inline data header. + */ + goto single_min_inline; + } + if (MLX5_TXOFF_CONFIG(VLAN) && + vlan && !txq->vlan_en) { + /* + * We must insert VLAN tag + * by software means. + */ + goto single_part_inline; + } + goto single_no_inline; + } + /* + * Completely inlined packet data WQE: + * - Control Segment, SEND opcode + * - Ethernet Segment, no VLAN insertion + * - Data inlined, VLAN optionally inserted + * - Alignment to MLX5_WSEG_SIZE + * Have to estimate amount of WQEBBs + */ + seg_n = (inlen + 3 * MLX5_WSEG_SIZE - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + /* Check if there are enough WQEBBs. */ + wqe_n = (seg_n + 3) / 4; + if (wqe_n > loc->wqe_free) + return MLX5_TXCMP_CODE_EXIT; + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, seg_n, + MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_data(txq, loc, wqe, + vlan, inlen, 0, olx); + txq->wqe_ci += wqe_n; + loc->wqe_free -= wqe_n; + /* + * Packet data are completely inlined, + * free the packet immediately. 
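
A worked example of the seg_n/wqe_n estimate above for a completely inlined
SEND WQE. The 18-byte minimal Ethernet Segment inline size follows from the
static_assert earlier in this routine (Ethernet plus VLAN header); the 16-byte
WSEG and the 4-WSEG WQEBB are assumed constants mirroring mlx5_prm.h.

#include <stdio.h>

#define WSEG_SIZE       16u     /* assumed MLX5_WSEG_SIZE */
#define ESEG_MIN_INLINE 18u     /* Ethernet + VLAN header */

int
main(void)
{
        unsigned int inlen = 128;       /* bytes to inline, VLAN included */
        unsigned int seg_n, wqe_n;

        /* Same formula as above: CSEG + ESEG + inlined data + padding. */
        seg_n = (inlen + 3 * WSEG_SIZE - ESEG_MIN_INLINE + WSEG_SIZE - 1) /
                WSEG_SIZE;
        wqe_n = (seg_n + 3) / 4;        /* 4 WSEGs per WQEBB */
        printf("inlen=%u -> seg_n=%u -> wqe_n=%u WQEBB(s)\n",
               inlen, seg_n, wqe_n);
        return 0;
}

A fully inlined 128-byte packet thus needs 10 WSEGs, i.e. 3 WQEBBs, which is
what the routine checks against loc->wqe_free before building the WQE.
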
+ */ + rte_pktmbuf_free_seg(loc->mbuf); + } else if ((!MLX5_TXOFF_CONFIG(EMPW) || + MLX5_TXOFF_CONFIG(MPW)) && + txq->inlen_mode) { + /* + * If minimal inlining is requested the eMPW + * feature should be disabled due to data is + * inlined into Ethernet Segment, which can + * not contain inlined data for eMPW due to + * segment shared for all packets. + */ + struct mlx5_wqe_dseg *restrict dseg; + unsigned int ds; + uint8_t *dptr; + + /* + * The inline-mode settings require + * to inline the specified amount of + * data bytes to the Ethernet Segment. + * We should check the free space in + * WQE ring buffer to inline partially. + */ +single_min_inline: + MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode); + MLX5_ASSERT(inlen > txq->inlen_mode); + MLX5_ASSERT(txq->inlen_mode >= + MLX5_ESEG_MIN_INLINE_SIZE); + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Finishing Data Segment of pointer type + */ + ds = (MLX5_WQE_CSEG_SIZE + + MLX5_WQE_ESEG_SIZE + + MLX5_WQE_DSEG_SIZE + + txq->inlen_mode - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WQE_DSEG_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (loc->wqe_free < ((ds + 3) / 4)) + return MLX5_TXCMP_CODE_EXIT; + /* + * Build the ordinary SEND WQE: + * - Control Segment + * - Ethernet Segment, inline inlen_mode bytes + * - Data Segment of pointer type + */ + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, ds, + MLX5_OPCODE_SEND, olx); + dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, + txq->inlen_mode, + 0, olx); + dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + + txq->inlen_mode - vlan; + inlen -= txq->inlen_mode; + mlx5_tx_dseg_ptr(txq, loc, dseg, + dptr, inlen, olx); + /* + * WQE is built, update the loop parameters + * and got to the next packet. + */ + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + /* We have to store mbuf in elts.*/ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + txq->elts[txq->elts_head++ & txq->elts_m] = + loc->mbuf; + --loc->elts_free; + } else { + uint8_t *dptr; + unsigned int dlen; + + /* + * Partially inlined packet data WQE, we have + * some space in title WQEBB, we can fill it + * with some packet data. It takes one WQEBB, + * it is available, no extra space check: + * - Control Segment, SEND opcode + * - Ethernet Segment, no VLAN insertion + * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data + * - Data Segment, pointer type + * + * We also get here if VLAN insertion is not + * supported by HW, the inline is enabled. + */ +single_part_inline: + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 4, + MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); + dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + + MLX5_ESEG_MIN_INLINE_SIZE - vlan; + /* + * The length check is performed above, by + * comparing with txq->inlen_send. We should + * not get overflow here. + */ + MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE); + dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; + mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], + dptr, dlen, olx); + ++txq->wqe_ci; + --loc->wqe_free; + /* We have to store mbuf in elts.*/ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + txq->elts[txq->elts_head++ & txq->elts_m] = + loc->mbuf; + --loc->elts_free; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. 
*/ + txq->stats.obytes += vlan + + rte_pktmbuf_data_len(loc->mbuf); +#endif + } else { + /* + * No inline at all, it means the CPU cycles saving + * is prioritized at configuration, we should not + * copy any packet data to WQE. + * + * SEND WQE, one WQEBB: + * - Control Segment, SEND opcode + * - Ethernet Segment, optional VLAN, no inline + * - Data Segment, pointer type + */ +single_no_inline: + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 3, + MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_none(txq, loc, wqe, olx); + mlx5_tx_dseg_ptr + (txq, loc, &wqe->dseg[0], + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + rte_pktmbuf_data_len(loc->mbuf), olx); + ++txq->wqe_ci; + --loc->wqe_free; + /* + * We should not store mbuf pointer in elts + * if no inlining is configured, this is done + * by calling routine in a batch copy. + */ + MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); + --loc->elts_free; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + txq->stats.obytes += + sizeof(struct rte_vlan_hdr); +#endif + } + ++loc->pkts_sent; + --pkts_n; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + if (pkts_n > 1) + rte_prefetch0(*pkts); + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) + return ret; + } + MLX5_ASSERT(false); +} + +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + enum mlx5_txcmp_code ret; + + ret = mlx5_tx_able_to_empw(txq, loc, olx, false); + if (ret == MLX5_TXCMP_CODE_SINGLE) + goto ordinary_send; + MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW); + for (;;) { + /* Optimize for inline/no inline eMPW send. */ + ret = (MLX5_TXOFF_CONFIG(INLINE)) ? + mlx5_tx_burst_empw_inline + (txq, pkts, pkts_n, loc, olx) : + mlx5_tx_burst_empw_simple + (txq, pkts, pkts_n, loc, olx); + if (ret != MLX5_TXCMP_CODE_SINGLE) + return ret; + /* The resources to send one packet should remain. */ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); +ordinary_send: + ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); + MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE); + if (ret != MLX5_TXCMP_CODE_EMPW) + return ret; + /* The resources to send one packet should remain. */ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + } +} + +/** + * DPDK Tx callback template. This is configured template + * used to generate routines optimized for specified offload setup. + * One of this generated functions is chosen at SQ configuration + * time. + * + * @param txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param olx + * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx + * values. Should be static to take compile time static configuration + * advantages. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). 
+ */ +static __rte_always_inline uint16_t +mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + uint16_t pkts_n, + unsigned int olx) +{ + struct mlx5_txq_local loc; + enum mlx5_txcmp_code ret; + unsigned int part; + + MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); + MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + if (unlikely(!pkts_n)) + return 0; + loc.pkts_sent = 0; + loc.pkts_copy = 0; + loc.wqe_last = NULL; + +send_loop: + loc.pkts_loop = loc.pkts_sent; + /* + * Check if there are some CQEs, if any: + * - process an encountered errors + * - process the completed WQEs + * - free related mbufs + * - doorbell the NIC about processed CQEs + */ + rte_prefetch0(*(pkts + loc.pkts_sent)); + mlx5_tx_handle_completion(txq, olx); + /* + * Calculate the number of available resources - elts and WQEs. + * There are two possible different scenarios: + * - no data inlining into WQEs, one WQEBB may contains up to + * four packets, in this case elts become scarce resource + * - data inlining into WQEs, one packet may require multiple + * WQEBBs, the WQEs become the limiting factor. + */ + MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); + loc.elts_free = txq->elts_s - + (uint16_t)(txq->elts_head - txq->elts_tail); + MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + loc.wqe_free = txq->wqe_s - + (uint16_t)(txq->wqe_ci - txq->wqe_pi); + if (unlikely(!loc.elts_free || !loc.wqe_free)) + goto burst_exit; + for (;;) { + /* + * Fetch the packet from array. Usually this is + * the first packet in series of multi/single + * segment packets. + */ + loc.mbuf = *(pkts + loc.pkts_sent); + /* Dedicated branch for multi-segment packets. */ + if (MLX5_TXOFF_CONFIG(MULTI) && + unlikely(NB_SEGS(loc.mbuf) > 1)) { + /* + * Multi-segment packet encountered. + * Hardware is able to process it only + * with SEND/TSO opcodes, one packet + * per WQE, do it in dedicated routine. + */ +enter_send_multi: + MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy); + part = loc.pkts_sent - loc.pkts_copy; + if (!MLX5_TXOFF_CONFIG(INLINE) && part) { + /* + * There are some single-segment mbufs not + * stored in elts. The mbufs must be in the + * same order as WQEs, so we must copy the + * mbufs to elts here, before the coming + * multi-segment packet mbufs is appended. + */ + mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, + part, olx); + loc.pkts_copy = loc.pkts_sent; + } + MLX5_ASSERT(pkts_n > loc.pkts_sent); + ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); + if (!MLX5_TXOFF_CONFIG(INLINE)) + loc.pkts_copy = loc.pkts_sent; + /* + * These returned code checks are supposed + * to be optimized out due to routine inlining. + */ + if (ret == MLX5_TXCMP_CODE_EXIT) { + /* + * The routine returns this code when + * all packets are sent or there is no + * enough resources to complete request. + */ + break; + } + if (ret == MLX5_TXCMP_CODE_ERROR) { + /* + * The routine returns this code when + * some error in the incoming packets + * format occurred. + */ + txq->stats.oerrors++; + break; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) { + /* + * The single-segment packet was encountered + * in the array, try to send it with the + * best optimized way, possible engaging eMPW. + */ + goto enter_send_single; + } + if (MLX5_TXOFF_CONFIG(TSO) && + ret == MLX5_TXCMP_CODE_TSO) { + /* + * The single-segment TSO packet was + * encountered in the array. + */ + goto enter_send_tso; + } + /* We must not get here. Something is going wrong. 
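
The elts_free/wqe_free computations above rely on free-running 16-bit indices:
subtracting the consumer index from the producer index yields the in-flight
count even across counter wraparound. A minimal sketch of that idiom with
example values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint16_t elts_s = 512;          /* ring size */
        uint16_t elts_tail = 65530;     /* consumer, about to wrap */
        uint16_t elts_head = 65533;     /* producer */
        unsigned int used, avail;

        used = (uint16_t)(elts_head - elts_tail);
        avail = elts_s - used;
        printf("before wrap: used=%u free=%u\n", used, avail);
        elts_head += 10;                /* producer wraps around to 7 */
        used = (uint16_t)(elts_head - elts_tail);
        avail = elts_s - used;
        printf("after wrap:  used=%u free=%u\n", used, avail);
        return 0;
}
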
*/ + MLX5_ASSERT(false); + txq->stats.oerrors++; + break; + } + /* Dedicated branch for single-segment TSO packets. */ + if (MLX5_TXOFF_CONFIG(TSO) && + unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { + /* + * TSO might require special way for inlining + * (dedicated parameters) and is sent with + * MLX5_OPCODE_TSO opcode only, provide this + * in dedicated branch. + */ +enter_send_tso: + MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1); + MLX5_ASSERT(pkts_n > loc.pkts_sent); + ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); + /* + * These returned code checks are supposed + * to be optimized out due to routine inlining. + */ + if (ret == MLX5_TXCMP_CODE_EXIT) + break; + if (ret == MLX5_TXCMP_CODE_ERROR) { + txq->stats.oerrors++; + break; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) + goto enter_send_single; + if (MLX5_TXOFF_CONFIG(MULTI) && + ret == MLX5_TXCMP_CODE_MULTI) { + /* + * The multi-segment packet was + * encountered in the array. + */ + goto enter_send_multi; + } + /* We must not get here. Something is going wrong. */ + MLX5_ASSERT(false); + txq->stats.oerrors++; + break; + } + /* + * The dedicated branch for the single-segment packets + * without TSO. Often these ones can be sent using + * MLX5_OPCODE_EMPW with multiple packets in one WQE. + * The routine builds the WQEs till it encounters + * the TSO or multi-segment packet (in case if these + * offloads are requested at SQ configuration time). + */ +enter_send_single: + MLX5_ASSERT(pkts_n > loc.pkts_sent); + ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); + /* + * These returned code checks are supposed + * to be optimized out due to routine inlining. + */ + if (ret == MLX5_TXCMP_CODE_EXIT) + break; + if (ret == MLX5_TXCMP_CODE_ERROR) { + txq->stats.oerrors++; + break; + } + if (MLX5_TXOFF_CONFIG(MULTI) && + ret == MLX5_TXCMP_CODE_MULTI) { + /* + * The multi-segment packet was + * encountered in the array. + */ + goto enter_send_multi; + } + if (MLX5_TXOFF_CONFIG(TSO) && + ret == MLX5_TXCMP_CODE_TSO) { + /* + * The single-segment TSO packet was + * encountered in the array. + */ + goto enter_send_tso; + } + /* We must not get here. Something is going wrong. */ + MLX5_ASSERT(false); + txq->stats.oerrors++; + break; + } + /* + * Main Tx loop is completed, do the rest: + * - set completion request if thresholds are reached + * - doorbell the hardware + * - copy the rest of mbufs to elts (if any) + */ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) || + loc.pkts_sent >= loc.pkts_copy); + /* Take a shortcut if nothing is sent. */ + if (unlikely(loc.pkts_sent == loc.pkts_loop)) + goto burst_exit; + /* Request CQE generation if limits are reached. */ + mlx5_tx_request_completion(txq, &loc, olx); + /* + * Ring QP doorbell immediately after WQE building completion + * to improve latencies. The pure software related data treatment + * can be completed after doorbell. Tx CQEs for this SQ are + * processed in this thread only by the polling. + * + * The rdma core library can map doorbell register in two ways, + * depending on the environment variable "MLX5_SHUT_UP_BF": + * + * - as regular cached memory, the variable is either missing or + * set to zero. This type of mapping may cause the significant + * doorbell register writing latency and requires explicit + * memory write barrier to mitigate this issue and prevent + * write combining. + * + * - as non-cached memory, the variable is present and set to + * not "0" value. 
This type of mapping may cause performance + * impact under heavy loading conditions but the explicit write + * memory barrier is not required and it may improve core + * performance. + * + * - the legacy behaviour (prior 19.08 release) was to use some + * heuristics to decide whether write memory barrier should + * be performed. This behavior is supported with specifying + * tx_db_nc=2, write barrier is skipped if application + * provides the full recommended burst of packets, it + * supposes the next packets are coming and the write barrier + * will be issued on the next burst (after descriptor writing, + * at least). + */ + mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && + (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); + /* Not all of the mbufs may be stored into elts yet. */ + part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; + if (!MLX5_TXOFF_CONFIG(INLINE) && part) { + /* + * There are some single-segment mbufs not stored in elts. + * It can be only if the last packet was single-segment. + * The copying is gathered into one place due to it is + * a good opportunity to optimize that with SIMD. + * Unfortunately if inlining is enabled the gaps in + * pointer array may happen due to early freeing of the + * inlined mbufs. + */ + mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); + loc.pkts_copy = loc.pkts_sent; + } + MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); + MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + if (pkts_n > loc.pkts_sent) { + /* + * If burst size is large there might be no enough CQE + * fetched from completion queue and no enough resources + * freed to send all the packets. + */ + goto send_loop; + } +burst_exit: +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent packets counter. */ + txq->stats.opackets += loc.pkts_sent; +#endif + return loc.pkts_sent; +} + +/* Generate routines with Enhanced Multi-Packet Write support. 
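
A small sketch of the barrier policy encoded in the mlx5_tx_dbrec_cond_wmb()
condition above. MLX5_TX_DEFAULT_BURST is assumed here to be 64 packets; db_nc
reflects the non-cached doorbell mapping and db_heu the heuristic behaviour
described in the comment.

#include <stdint.h>
#include <stdio.h>

#define TX_DEFAULT_BURST 64u    /* assumed MLX5_TX_DEFAULT_BURST */

/* Return non-zero when an extra flushing write barrier must follow the
 * BlueFlame copy, mirroring the condition passed above. */
static int
need_flush_barrier(int db_nc, int db_heu, uint16_t pkts_n)
{
        if (db_nc)              /* non-cached mapping: no barrier needed */
                return 0;
        if (db_heu && (pkts_n % TX_DEFAULT_BURST) == 0)
                return 0;       /* heuristic: full burst, skip the barrier */
        return 1;
}

int
main(void)
{
        printf("cached map, heuristic, 64-pkt burst: %d\n",
               need_flush_barrier(0, 1, 64));
        printf("cached map, heuristic, 17-pkt burst: %d\n",
               need_flush_barrier(0, 1, 17));
        printf("non-cached map                     : %d\n",
               need_flush_barrier(1, 0, 17));
        return 0;
}
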
*/ +MLX5_TXOFF_DECL(full_empw, + MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(none_empw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(md_empw, + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mt_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mtsc_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mti_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mtv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mtiv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(sc_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(sci_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(scv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(sciv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(i_empw, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(v_empw, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(iv_empw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +/* Generate routines without Enhanced Multi-Packet Write support. 
*/ +MLX5_TXOFF_DECL(full, + MLX5_TXOFF_CONFIG_FULL) + +MLX5_TXOFF_DECL(none, + MLX5_TXOFF_CONFIG_NONE) + +MLX5_TXOFF_DECL(md, + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(mt, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(mtsc, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(mti, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(mtv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(mtiv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(sc, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(sci, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(scv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(sciv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(i, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(v, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(iv, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +/* + * Generate routines with Legacy Multi-Packet Write support. + * This mode is supported by ConnectX-4 Lx only and imposes + * offload limitations, not supported: + * - ACL/Flows (metadata are becoming meaningless) + * - WQE Inline headers + * - SRIOV (E-Switch offloads) + * - VLAN insertion + * - tunnel encapsulation/decapsulation + * - TSO + */ +MLX5_TXOFF_DECL(none_mpw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(mci_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(mc_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +/* + * Array of declared and compiled Tx burst function and corresponding + * supported offloads set. The array is used to select the Tx burst + * function for specified offloads set at Tx queue configuration time. 
+ */ +const struct { + eth_tx_burst_t func; + unsigned int olx; +} txoff_func[] = { +MLX5_TXOFF_INFO(full_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(none_empw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(md_empw, + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mt_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mtsc_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mti_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mtv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mtiv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(sc_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(sci_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(scv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(sciv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(i_empw, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(v_empw, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(iv_empw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(full, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(none, + MLX5_TXOFF_CONFIG_NONE) + +MLX5_TXOFF_INFO(md, + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mt, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mtsc, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mti, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mtv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mtiv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(sc, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(sci, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + 
MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(scv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(sciv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(i, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(v, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(iv, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(none_mpw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mci_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mc_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) +}; + +/** + * Configure the Tx function to use. The routine checks configured + * Tx offloads for the device and selects appropriate Tx burst + * routine. There are multiple Tx burst routines compiled from + * the same template in the most optimal way for the dedicated + * Tx offloads set. + * + * @param dev + * Pointer to private data structure. + * + * @return + * Pointer to selected Tx burst function. + */ +eth_tx_burst_t +mlx5_select_tx_function(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; + unsigned int diff = 0, olx = 0, i, m; + + static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= + MLX5_DSEG_MAX, "invalid WQE max size"); + static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, + "invalid WQE Control Segment size"); + static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, + "invalid WQE Ethernet Segment size"); + static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, + "invalid WQE Data Segment size"); + static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, + "invalid WQE size"); + MLX5_ASSERT(priv); + if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { + /* We should support Multi-Segment Packets. */ + olx |= MLX5_TXOFF_CONFIG_MULTI; + } + if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO)) { + /* We should support TCP Send Offload. */ + olx |= MLX5_TXOFF_CONFIG_TSO; + } + if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { + /* We should support Software Parser for Tunnels. */ + olx |= MLX5_TXOFF_CONFIG_SWP; + } + if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { + /* We should support IP/TCP/UDP Checksums. */ + olx |= MLX5_TXOFF_CONFIG_CSUM; + } + if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) { + /* We should support VLAN insertion. */ + olx |= MLX5_TXOFF_CONFIG_VLAN; + } + if (priv->txqs_n && (*priv->txqs)[0]) { + struct mlx5_txq_data *txd = (*priv->txqs)[0]; + + if (txd->inlen_send) { + /* + * Check the data inline requirements. Data inline + * is enabled on per device basis, we can check + * the first Tx queue only. 
+ * + * If device does not support VLAN insertion in WQE + * and some queues are requested to perform VLAN + * insertion offload than inline must be enabled. + */ + olx |= MLX5_TXOFF_CONFIG_INLINE; + } + } + if (config->mps == MLX5_MPW_ENHANCED && + config->txq_inline_min <= 0) { + /* + * The NIC supports Enhanced Multi-Packet Write + * and does not require minimal inline data. + */ + olx |= MLX5_TXOFF_CONFIG_EMPW; + } + if (rte_flow_dynf_metadata_avail()) { + /* We should support Flow metadata. */ + olx |= MLX5_TXOFF_CONFIG_METADATA; + } + if (config->mps == MLX5_MPW) { + /* + * The NIC supports Legacy Multi-Packet Write. + * The MLX5_TXOFF_CONFIG_MPW controls the + * descriptor building method in combination + * with MLX5_TXOFF_CONFIG_EMPW. + */ + if (!(olx & (MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA))) + olx |= MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW; + } + /* + * Scan the routines table to find the minimal + * satisfying routine with requested offloads. + */ + m = RTE_DIM(txoff_func); + for (i = 0; i < RTE_DIM(txoff_func); i++) { + unsigned int tmp; + + tmp = txoff_func[i].olx; + if (tmp == olx) { + /* Meets requested offloads exactly.*/ + m = i; + break; + } + if ((tmp & olx) != olx) { + /* Does not meet requested offloads at all. */ + continue; + } + if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW) + /* Do not enable eMPW if not configured. */ + continue; + if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE) + /* Do not enable inlining if not configured. */ + continue; + /* + * Some routine meets the requirements. + * Check whether it has minimal amount + * of not requested offloads. + */ + tmp = __builtin_popcountl(tmp & ~olx); + if (m >= RTE_DIM(txoff_func) || tmp < diff) { + /* First or better match, save and continue. */ + m = i; + diff = tmp; + continue; + } + if (tmp == diff) { + tmp = txoff_func[i].olx ^ txoff_func[m].olx; + if (__builtin_ffsl(txoff_func[i].olx & ~tmp) < + __builtin_ffsl(txoff_func[m].olx & ~tmp)) { + /* Lighter not requested offload. */ + m = i; + } + } + } + if (m >= RTE_DIM(txoff_func)) { + DRV_LOG(DEBUG, "port %u has no selected Tx function" + " for requested offloads %04X", + dev->data->port_id, olx); + return NULL; + } + DRV_LOG(DEBUG, "port %u has selected Tx function" + " supporting offloads %04X/%04X", + dev->data->port_id, olx, txoff_func[m].olx); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI) + DRV_LOG(DEBUG, "\tMULTI (multi segment)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO) + DRV_LOG(DEBUG, "\tTSO (TCP send offload)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP) + DRV_LOG(DEBUG, "\tSWP (software parser)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM) + DRV_LOG(DEBUG, "\tCSUM (checksum offload)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE) + DRV_LOG(DEBUG, "\tINLIN (inline data)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN) + DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA) + DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) { + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW) + DRV_LOG(DEBUG, "\tMPW (Legacy MPW)"); + else + DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); + } + return txoff_func[m].func; +} + +/** + * DPDK callback to get the TX queue information + * + * @param dev + * Pointer to the device structure. + * + * @param tx_queue_id + * Tx queue identificator. + * + * @param qinfo + * Pointer to the TX queue information structure. + * + * @return + * None. 
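
The selection loop above boils down to a best-superset search: an exact match
of the requested offload bits wins immediately, otherwise the candidate routine
with the fewest not-requested offload bits is chosen. A simplified standalone
sketch of that search; the eMPW/inline opt-out rules and the tie-breaking on
the lowest extra bit are omitted, and the candidate table and bit values below
are hypothetical.

#include <stdio.h>

struct cand {
        const char *name;
        unsigned int olx;       /* supported offload bits */
};

static int
select_func(const struct cand *tbl, int n, unsigned int olx)
{
        int i, best = -1, best_extra = 0;

        for (i = 0; i < n; i++) {
                int extra;

                if (tbl[i].olx == olx)
                        return i;               /* exact match */
                if ((tbl[i].olx & olx) != olx)
                        continue;               /* misses a requested bit */
                extra = __builtin_popcount(tbl[i].olx & ~olx);
                if (best < 0 || extra < best_extra) {
                        best = i;
                        best_extra = extra;
                }
        }
        return best;
}

int
main(void)
{
        static const struct cand tbl[] = {
                { "full", 0x3f }, { "mt", 0x03 }, { "md", 0x20 }, { "none", 0x0 },
        };
        int m = select_func(tbl, 4, 0x21);      /* hypothetical request */

        printf("selected: %s\n", m >= 0 ? tbl[m].name : "(none)");
        return 0;
}
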
+ */ + +void +mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id, + struct rte_eth_txq_info *qinfo) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + + if (!txq) + return; + qinfo->nb_desc = txq->elts_s; + qinfo->conf.tx_thresh.pthresh = 0; + qinfo->conf.tx_thresh.hthresh = 0; + qinfo->conf.tx_thresh.wthresh = 0; + qinfo->conf.tx_rs_thresh = 0; + qinfo->conf.tx_free_thresh = 0; + qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1; + qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads; +} + +/** + * DPDK callback to get the TX packet burst mode information + * + * @param dev + * Pointer to the device structure. + * + * @param tx_queue_id + * Tx queue identificatior. + * + * @param mode + * Pointer to the burts mode information. + * + * @return + * 0 as success, -EINVAL as failure. + */ + +int +mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, + uint16_t tx_queue_id __rte_unused, + struct rte_eth_burst_mode *mode) +{ + eth_tx_burst_t pkt_burst = dev->tx_pkt_burst; + unsigned int i, olx; + + for (i = 0; i < RTE_DIM(txoff_func); i++) { + if (pkt_burst == txoff_func[i].func) { + olx = txoff_func[i].olx; + snprintf(mode->info, sizeof(mode->info), + "%s%s%s%s%s%s%s%s", + (olx & MLX5_TXOFF_CONFIG_EMPW) ? + ((olx & MLX5_TXOFF_CONFIG_MPW) ? + "Legacy MPW" : "Enhanced MPW") : "No MPW", + (olx & MLX5_TXOFF_CONFIG_MULTI) ? + " + MULTI" : "", + (olx & MLX5_TXOFF_CONFIG_TSO) ? + " + TSO" : "", + (olx & MLX5_TXOFF_CONFIG_SWP) ? + " + SWP" : "", + (olx & MLX5_TXOFF_CONFIG_CSUM) ? + " + CSUM" : "", + (olx & MLX5_TXOFF_CONFIG_INLINE) ? + " + INLINE" : "", + (olx & MLX5_TXOFF_CONFIG_VLAN) ? + " + VLAN" : "", + (olx & MLX5_TXOFF_CONFIG_METADATA) ? + " + METADATA" : ""); + return 0; + } + } + return -EINVAL; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h new file mode 100644 index 000000000..48f2b7941 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h @@ -0,0 +1,683 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_H_ +#define RTE_PMD_MLX5_RXTX_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_common.h> +#include <rte_hexdump.h> +#include <rte_atomic.h> +#include <rte_spinlock.h> +#include <rte_io.h> +#include <rte_bus_pci.h> +#include <rte_malloc.h> + +#include <mlx5_glue.h> +#include <mlx5_prm.h> +#include <mlx5_common.h> +#include <mlx5_common_mr.h> + +#include "mlx5_defs.h" +#include "mlx5_utils.h" +#include "mlx5.h" +#include "mlx5_autoconf.h" + +/* Support tunnel matching. */ +#define MLX5_FLOW_TUNNEL 10 + +/* Mbuf dynamic flag offset for inline. */ +extern uint64_t rte_net_mlx5_dynf_inline_mask; + +struct mlx5_rxq_stats { +#ifdef MLX5_PMD_SOFT_COUNTERS + uint64_t ipackets; /**< Total of successfully received packets. */ + uint64_t ibytes; /**< Total of successfully received bytes. */ +#endif + uint64_t idropped; /**< Total of packets dropped when RX ring full. 
*/ + uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */ +}; + +struct mlx5_txq_stats { +#ifdef MLX5_PMD_SOFT_COUNTERS + uint64_t opackets; /**< Total of successfully sent packets. */ + uint64_t obytes; /**< Total of successfully sent bytes. */ +#endif + uint64_t oerrors; /**< Total number of failed transmitted packets. */ +}; + +struct mlx5_priv; + +/* Compressed CQE context. */ +struct rxq_zip { + uint16_t ai; /* Array index. */ + uint16_t ca; /* Current array index. */ + uint16_t na; /* Next array index. */ + uint16_t cq_ci; /* The next CQE. */ + uint32_t cqe_cnt; /* Number of CQEs. */ +}; + +/* Multi-Packet RQ buffer header. */ +struct mlx5_mprq_buf { + struct rte_mempool *mp; + rte_atomic16_t refcnt; /* Atomically accessed refcnt. */ + uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */ + struct rte_mbuf_ext_shared_info shinfos[]; + /* + * Shared information per stride. + * More memory will be allocated for the first stride head-room and for + * the strides data. + */ +} __rte_cache_aligned; + +/* Get pointer to the first stride. */ +#define mlx5_mprq_buf_addr(ptr, strd_n) (RTE_PTR_ADD((ptr), \ + sizeof(struct mlx5_mprq_buf) + \ + (strd_n) * \ + sizeof(struct rte_mbuf_ext_shared_info) + \ + RTE_PKTMBUF_HEADROOM)) + +#define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6 +#define MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES 9 + +enum mlx5_rxq_err_state { + MLX5_RXQ_ERR_STATE_NO_ERROR = 0, + MLX5_RXQ_ERR_STATE_NEED_RESET, + MLX5_RXQ_ERR_STATE_NEED_READY, +}; + +/* RX queue descriptor. */ +struct mlx5_rxq_data { + unsigned int csum:1; /* Enable checksum offloading. */ + unsigned int hw_timestamp:1; /* Enable HW timestamp. */ + unsigned int vlan_strip:1; /* Enable VLAN stripping. */ + unsigned int crc_present:1; /* CRC must be subtracted. */ + unsigned int sges_n:3; /* Log 2 of SGEs (max buffers per packet). */ + unsigned int cqe_n:4; /* Log 2 of CQ elements. */ + unsigned int elts_n:4; /* Log 2 of Mbufs. */ + unsigned int rss_hash:1; /* RSS hash result is enabled. */ + unsigned int mark:1; /* Marked flow available on the queue. */ + unsigned int strd_num_n:5; /* Log 2 of the number of stride. */ + unsigned int strd_sz_n:4; /* Log 2 of stride size. */ + unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */ + unsigned int err_state:2; /* enum mlx5_rxq_err_state. */ + unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */ + unsigned int lro:1; /* Enable LRO. */ + unsigned int dynf_meta:1; /* Dynamic metadata is configured. */ + volatile uint32_t *rq_db; + volatile uint32_t *cq_db; + uint16_t port_id; + uint32_t rq_ci; + uint16_t consumed_strd; /* Number of consumed strides in WQE. */ + uint32_t rq_pi; + uint32_t cq_ci; + uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */ + union { + struct rxq_zip zip; /* Compressed context. */ + uint16_t decompressed; + /* Number of ready mbufs decompressed from the CQ. */ + }; + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ + uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */ + volatile void *wqes; + volatile struct mlx5_cqe(*cqes)[]; + RTE_STD_C11 + union { + struct rte_mbuf *(*elts)[]; + struct mlx5_mprq_buf *(*mprq_bufs)[]; + }; + struct rte_mempool *mp; + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ + struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */ + uint16_t idx; /* Queue index. */ + struct mlx5_rxq_stats stats; + rte_xmm_t mbuf_initializer; /* Default rearm/flags for vectorized Rx. 
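
Several fields above (elts_n, cqe_n, strd_num_n, strd_sz_n) store sizes as log2
values packed into small bit-fields; the actual counts are recovered by
shifting. A short sketch with example values in the range suggested by
MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES above:

#include <stdio.h>

int
main(void)
{
        unsigned int strd_num_n = 9;    /* log2 of strides per MPRQ WQE */
        unsigned int strd_sz_n = 11;    /* log2 of stride size in bytes */
        unsigned int strd_n = 1u << strd_num_n;
        unsigned int strd_sz = 1u << strd_sz_n;

        printf("%u strides of %u bytes -> %u KiB of data per MPRQ buffer\n",
               strd_n, strd_sz, strd_n * strd_sz / 1024);
        return 0;
}
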
*/ + struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */ + void *cq_uar; /* CQ user access region. */ + uint32_t cqn; /* CQ number. */ + uint8_t cq_arm_sn; /* CQ arm seq number. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock_cq; + /* CQ (UAR) access lock required for 32bit implementations */ +#endif + uint32_t tunnel; /* Tunnel information. */ + uint64_t flow_meta_mask; + int32_t flow_meta_offset; +} __rte_cache_aligned; + +enum mlx5_rxq_obj_type { + MLX5_RXQ_OBJ_TYPE_IBV, /* mlx5_rxq_obj with ibv_wq. */ + MLX5_RXQ_OBJ_TYPE_DEVX_RQ, /* mlx5_rxq_obj with mlx5_devx_rq. */ + MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN, + /* mlx5_rxq_obj with mlx5_devx_rq and hairpin support. */ +}; + +enum mlx5_rxq_type { + MLX5_RXQ_TYPE_STANDARD, /* Standard Rx queue. */ + MLX5_RXQ_TYPE_HAIRPIN, /* Hairpin Rx queue. */ + MLX5_RXQ_TYPE_UNDEFINED, +}; + +/* Verbs/DevX Rx queue elements. */ +struct mlx5_rxq_obj { + LIST_ENTRY(mlx5_rxq_obj) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_rxq_ctrl *rxq_ctrl; /* Back pointer to parent. */ + struct ibv_cq *cq; /* Completion Queue. */ + enum mlx5_rxq_obj_type type; + RTE_STD_C11 + union { + struct ibv_wq *wq; /* Work Queue. */ + struct mlx5_devx_obj *rq; /* DevX object for Rx Queue. */ + }; + struct ibv_comp_channel *channel; +}; + +/* RX queue control descriptor. */ +struct mlx5_rxq_ctrl { + struct mlx5_rxq_data rxq; /* Data path structure. */ + LIST_ENTRY(mlx5_rxq_ctrl) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_rxq_obj *obj; /* Verbs/DevX elements. */ + struct mlx5_priv *priv; /* Back pointer to private data. */ + enum mlx5_rxq_type type; /* Rxq type. */ + unsigned int socket; /* CPU socket ID for allocations. */ + unsigned int irq:1; /* Whether IRQ is enabled. */ + unsigned int dbr_umem_id_valid:1; /* dbr_umem_id holds a valid value. */ + uint32_t flow_mark_n; /* Number of Mark/Flag flows using this Queue. */ + uint32_t flow_tunnels_n[MLX5_FLOW_TUNNEL]; /* Tunnels counters. */ + uint32_t wqn; /* WQ number. */ + uint16_t dump_file_n; /* Number of dump files. */ + uint32_t dbr_umem_id; /* Storing door-bell information, */ + uint64_t dbr_offset; /* needed when freeing door-bell. */ + struct mlx5dv_devx_umem *wq_umem; /* WQ buffer registration info. */ + struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */ +}; + +enum mlx5_ind_tbl_type { + MLX5_IND_TBL_TYPE_IBV, + MLX5_IND_TBL_TYPE_DEVX, +}; + +/* Indirection table. */ +struct mlx5_ind_table_obj { + LIST_ENTRY(mlx5_ind_table_obj) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + enum mlx5_ind_tbl_type type; + RTE_STD_C11 + union { + struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */ + struct mlx5_devx_obj *rqt; /* DevX RQT object. */ + }; + uint32_t queues_n; /**< Number of queues in the list. */ + uint16_t queues[]; /**< Queue list. */ +}; + +/* Hash Rx queue. */ +struct mlx5_hrxq { + ILIST_ENTRY(uint32_t)next; /* Index to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_ind_table_obj *ind_table; /* Indirection table. */ + RTE_STD_C11 + union { + struct ibv_qp *qp; /* Verbs queue pair. */ + struct mlx5_devx_obj *tir; /* DevX TIR object. */ + }; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + void *action; /* DV QP action pointer. */ +#endif + uint64_t hash_fields; /* Verbs Hash fields. */ + uint32_t rss_key_len; /* Hash key length in bytes. */ + uint8_t rss_key[]; /* Hash key. 
*/ +}; + +/* TX queue send local data. */ +__extension__ +struct mlx5_txq_local { + struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */ + struct rte_mbuf *mbuf; /* first mbuf to process. */ + uint16_t pkts_copy; /* packets copied to elts. */ + uint16_t pkts_sent; /* packets sent. */ + uint16_t pkts_loop; /* packets sent on loop entry. */ + uint16_t elts_free; /* available elts remain. */ + uint16_t wqe_free; /* available wqe remain. */ + uint16_t mbuf_off; /* data offset in current mbuf. */ + uint16_t mbuf_nseg; /* number of remaining mbuf. */ +}; + +/* TX queue descriptor. */ +__extension__ +struct mlx5_txq_data { + uint16_t elts_head; /* Current counter in (*elts)[]. */ + uint16_t elts_tail; /* Counter of first element awaiting completion. */ + uint16_t elts_comp; /* elts index since last completion request. */ + uint16_t elts_s; /* Number of mbuf elements. */ + uint16_t elts_m; /* Mask for mbuf elements indices. */ + /* Fields related to elts mbuf storage. */ + uint16_t wqe_ci; /* Consumer index for work queue. */ + uint16_t wqe_pi; /* Producer index for work queue. */ + uint16_t wqe_s; /* Number of WQ elements. */ + uint16_t wqe_m; /* Mask Number for WQ elements. */ + uint16_t wqe_comp; /* WQE index since last completion request. */ + uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */ + /* WQ related fields. */ + uint16_t cq_ci; /* Consumer index for completion queue. */ + uint16_t cq_pi; /* Production index for completion queue. */ + uint16_t cqe_s; /* Number of CQ elements. */ + uint16_t cqe_m; /* Mask for CQ indices. */ + /* CQ related fields. */ + uint16_t elts_n:4; /* elts[] length (in log2). */ + uint16_t cqe_n:4; /* Number of CQ elements (in log2). */ + uint16_t wqe_n:4; /* Number of WQ elements (in log2). */ + uint16_t tso_en:1; /* When set hardware TSO is enabled. */ + uint16_t tunnel_en:1; + /* When set TX offload for tunneled packets are supported. */ + uint16_t swp_en:1; /* Whether SW parser is enabled. */ + uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */ + uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */ + uint16_t db_heu:1; /* Doorbell heuristic write barrier. */ + uint16_t inlen_send; /* Ordinary send data inline size. */ + uint16_t inlen_empw; /* eMPW max packet size to inline. */ + uint16_t inlen_mode; /* Minimal data length to inline. */ + uint32_t qp_num_8s; /* QP number shifted by 8. */ + uint64_t offloads; /* Offloads for Tx Queue. */ + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ + struct mlx5_wqe *wqes; /* Work queue. */ + struct mlx5_wqe *wqes_end; /* Work queue array limit. */ +#ifdef RTE_LIBRTE_MLX5_DEBUG + uint32_t *fcqs; /* Free completion queue (debug extended). */ +#else + uint16_t *fcqs; /* Free completion queue. */ +#endif + volatile struct mlx5_cqe *cqes; /* Completion queue. */ + volatile uint32_t *qp_db; /* Work queue doorbell. */ + volatile uint32_t *cq_db; /* Completion queue doorbell. */ + uint16_t port_id; /* Port ID of device. */ + uint16_t idx; /* Queue index. */ + struct mlx5_txq_stats stats; /* TX queue counters. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock; + /* UAR access lock required for 32bit implementations */ +#endif + struct rte_mbuf *elts[0]; + /* Storage for queued packets, must be the last field. */ +} __rte_cache_aligned; + +enum mlx5_txq_obj_type { + MLX5_TXQ_OBJ_TYPE_IBV, /* mlx5_txq_obj with ibv_wq. */ + MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN, + /* mlx5_txq_obj with mlx5_devx_tq and hairpin support. 
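
The elts_m/wqe_m masks above pair with the free-running elts_head/wqe_ci
counters to index power-of-two rings, as in
txq->elts[txq->elts_head++ & txq->elts_m] earlier in this file. A minimal
sketch of that masking with an example ring size:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint16_t elts_s = 256;          /* ring entries, power of two */
        uint16_t elts_m = elts_s - 1;   /* index mask, as in the struct above */
        uint16_t elts_head = 65534;     /* free-running producer counter */
        int i;

        for (i = 0; i < 4; i++) {
                unsigned int slot = elts_head & elts_m;

                printf("counter %5u -> slot %u\n", (unsigned)elts_head, slot);
                elts_head++;
        }
        return 0;
}

Because the mask is one less than a power of two, the modulo reduction keeps
working when the 16-bit counter wraps from 65535 back to 0.
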
*/ +}; + +enum mlx5_txq_type { + MLX5_TXQ_TYPE_STANDARD, /* Standard Tx queue. */ + MLX5_TXQ_TYPE_HAIRPIN, /* Hairpin Rx queue. */ +}; + +/* Verbs/DevX Tx queue elements. */ +struct mlx5_txq_obj { + LIST_ENTRY(mlx5_txq_obj) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_txq_ctrl *txq_ctrl; /* Pointer to the control queue. */ + enum mlx5_txq_obj_type type; /* The txq object type. */ + RTE_STD_C11 + union { + struct { + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_qp *qp; /* Queue Pair. */ + }; + struct { + struct mlx5_devx_obj *sq; + /* DevX object for Sx queue. */ + struct mlx5_devx_obj *tis; /* The TIS object. */ + }; + }; +}; + +/* TX queue control descriptor. */ +struct mlx5_txq_ctrl { + LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + unsigned int socket; /* CPU socket ID for allocations. */ + enum mlx5_txq_type type; /* The txq ctrl type. */ + unsigned int max_inline_data; /* Max inline data. */ + unsigned int max_tso_header; /* Max TSO header size. */ + struct mlx5_txq_obj *obj; /* Verbs/DevX queue object. */ + struct mlx5_priv *priv; /* Back pointer to private data. */ + off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */ + void *bf_reg; /* BlueFlame register from Verbs. */ + uint16_t dump_file_n; /* Number of dump files. */ + struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */ + struct mlx5_txq_data txq; /* Data path structure. */ + /* Must be the last field in the structure, contains elts[]. */ +}; + +#define MLX5_TX_BFREG(txq) \ + (MLX5_PROC_PRIV((txq)->port_id)->uar_table[(txq)->idx]) + +/* mlx5_rxq.c */ + +extern uint8_t rss_hash_default_key[]; + +int mlx5_check_mprq_support(struct rte_eth_dev *dev); +int mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq); +int mlx5_mprq_enabled(struct rte_eth_dev *dev); +int mlx5_mprq_free_mp(struct rte_eth_dev *dev); +int mlx5_mprq_alloc_mp(struct rte_eth_dev *dev); +int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +int mlx5_rx_hairpin_queue_setup + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +void mlx5_rx_queue_release(void *dpdk_rxq); +int mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev); +void mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev); +int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id); +int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id); +struct mlx5_rxq_obj *mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_rxq_obj_type type); +int mlx5_rxq_obj_verify(struct rte_eth_dev *dev); +struct mlx5_rxq_ctrl *mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +struct mlx5_rxq_ctrl *mlx5_rxq_hairpin_new + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +struct mlx5_rxq_ctrl *mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_verify(struct rte_eth_dev *dev); +int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl); +int mlx5_ind_table_obj_verify(struct rte_eth_dev *dev); +uint32_t mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t 
*queues, uint32_t queues_n, + int tunnel __rte_unused); +uint32_t mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n); +int mlx5_hrxq_release(struct rte_eth_dev *dev, uint32_t hxrq_idx); +int mlx5_hrxq_verify(struct rte_eth_dev *dev); +enum mlx5_rxq_type mlx5_rxq_get_type(struct rte_eth_dev *dev, uint16_t idx); +struct mlx5_hrxq *mlx5_hrxq_drop_new(struct rte_eth_dev *dev); +void mlx5_hrxq_drop_release(struct rte_eth_dev *dev); +uint64_t mlx5_get_rx_port_offloads(void); +uint64_t mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev); + +/* mlx5_txq.c */ + +int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf); +int mlx5_tx_hairpin_queue_setup + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +void mlx5_tx_queue_release(void *dpdk_txq); +int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd); +struct mlx5_txq_obj *mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_txq_obj_type type); +struct mlx5_txq_obj *mlx5_txq_obj_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_obj_release(struct mlx5_txq_obj *txq_ibv); +int mlx5_txq_obj_verify(struct rte_eth_dev *dev); +struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf); +struct mlx5_txq_ctrl *mlx5_txq_hairpin_new + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_verify(struct rte_eth_dev *dev); +void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl); +void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl); +uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev); + +/* mlx5_rxtx.c */ + +extern uint32_t mlx5_ptype_table[]; +extern uint8_t mlx5_cksum_table[]; +extern uint8_t mlx5_swp_types_table[]; + +void mlx5_set_ptype_table(void); +void mlx5_set_cksum_table(void); +void mlx5_set_swp_types_table(void); +uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); +void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq); +__rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec); +void mlx5_mprq_buf_free_cb(void *addr, void *opaque); +void mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf); +uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +int mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset); +int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset); +uint32_t mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id); +void mlx5_dump_debug_information(const char *path, const char *title, + const void *buf, unsigned int len); +int mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, + const struct mlx5_mp_arg_queue_state_modify *sm); +void mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_rxq_info *qinfo); +void mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_txq_info *qinfo); +int 
mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t rx_queue_id, + struct rte_eth_burst_mode *mode); +int mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t tx_queue_id, + struct rte_eth_burst_mode *mode); + +/* Vectorized version of mlx5_rxtx.c */ +int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data); +int mlx5_check_vec_rx_support(struct rte_eth_dev *dev); +uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); + +/* mlx5_mr.c */ + +void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl); +uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr); +uint32_t mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb); +uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr, + struct rte_mempool *mp); +int mlx5_dma_map(struct rte_pci_device *pdev, void *addr, uint64_t iova, + size_t len); +int mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, uint64_t iova, + size_t len); + +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64_relaxed(uint64_t val, void *addr, + rte_spinlock_t *lock __rte_unused) +{ +#ifdef RTE_ARCH_64 + *(uint64_t *)addr = val; +#else /* !RTE_ARCH_64 */ + rte_spinlock_lock(lock); + *(uint32_t *)addr = val; + rte_io_wmb(); + *((uint32_t *)addr + 1) = val >> 32; + rte_spinlock_unlock(lock); +#endif +} + +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures while guaranteeing the order of execution with the + * code being executed. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64(uint64_t val, void *addr, rte_spinlock_t *lock) +{ + rte_io_wmb(); + __mlx5_uar_write64_relaxed(val, addr, lock); +} + +/* Assist macros, used instead of directly calling the functions they wrap. */ +#ifdef RTE_ARCH_64 +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, NULL) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, NULL) +#else +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, lock) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock) +#endif + +/** + * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the + * cloned mbuf is allocated is returned instead. + * + * @param buf + * Pointer to mbuf. + * + * @return + * Memory pool where data is located for given mbuf. + */ +static inline struct rte_mempool * +mlx5_mb2mp(struct rte_mbuf *buf) +{ + if (unlikely(RTE_MBUF_CLONED(buf))) + return rte_mbuf_from_indirect(buf)->pool; + return buf->pool; +} + +/** + * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx + * as mempool is pre-configured and static. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Address to search. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr) +{ + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + uint32_t lkey; + + /* Linear search on MR cache array. 
*/ + lkey = mlx5_mr_lookup_lkey(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half (Binary Search) on miss. */ + return mlx5_rx_addr2mr_bh(rxq, addr); +} + +#define mlx5_rx_mb2mr(rxq, mb) mlx5_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) + +/** + * Query LKey from a packet buffer for Tx. If not found, add the mempool. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Address to search. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb) +{ + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + uintptr_t addr = (uintptr_t)mb->buf_addr; + uint32_t lkey; + + /* Check generation bit to see if there's any change on existing MRs. */ + if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen)) + mlx5_mr_flush_local_cache(mr_ctrl); + /* Linear search on MR cache array. */ + lkey = mlx5_mr_lookup_lkey(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half on miss. */ + return mlx5_tx_mb2mr_bh(txq, mb); +} + +/** + * Ring TX queue doorbell and flush the update if requested. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the last WQE posted in the NIC. + * @param cond + * Request for write memory barrier after BlueFlame update. + */ +static __rte_always_inline void +mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, + int cond) +{ + uint64_t *dst = MLX5_TX_BFREG(txq); + volatile uint64_t *src = ((volatile uint64_t *)wqe); + + rte_cio_wmb(); + *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci); + /* Ensure ordering between DB record and BF copy. */ + rte_wmb(); + mlx5_uar_write64_relaxed(*src, dst, txq->uar_lock); + if (cond) + rte_wmb(); +} + +/** + * Ring TX queue doorbell and flush the update by write memory barrier. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the last WQE posted in the NIC. + */ +static __rte_always_inline void +mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe) +{ + mlx5_tx_dbrec_cond_wmb(txq, wqe, 1); +} + +#endif /* RTE_PMD_MLX5_RXTX_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c new file mode 100644 index 000000000..1518bdd5b --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_rxtx_vec.h" +#include "mlx5_autoconf.h" + +#if defined RTE_ARCH_X86_64 +#include "mlx5_rxtx_vec_sse.h" +#elif defined RTE_ARCH_ARM64 +#include "mlx5_rxtx_vec_neon.h" +#elif defined RTE_ARCH_PPC_64 +#include "mlx5_rxtx_vec_altivec.h" +#else +#error "This should not be compiled if SIMD instructions are not supported." +#endif + +/** + * Skip error packets. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +static uint16_t +rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n) +{ + uint16_t n = 0; + unsigned int i; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t err_bytes = 0; +#endif + + for (i = 0; i < pkts_n; ++i) { + struct rte_mbuf *pkt = pkts[i]; + + if (pkt->packet_type == RTE_PTYPE_ALL_MASK || rxq->err_state) { +#ifdef MLX5_PMD_SOFT_COUNTERS + err_bytes += PKT_LEN(pkt); +#endif + rte_pktmbuf_free_seg(pkt); + } else { + pkts[n++] = pkt; + } + } + rxq->stats.idropped += (pkts_n - n); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Correct counters of errored completions. */ + rxq->stats.ipackets -= (pkts_n - n); + rxq->stats.ibytes -= err_bytes; +#endif + mlx5_rx_err_handle(rxq, 1); + return n; +} + +/** + * DPDK callback for vectorized RX. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + uint16_t nb_rx; + uint64_t err = 0; + + nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err); + if (unlikely(err | rxq->err_state)) + nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx); + return nb_rx; +} + +/** + * Check a RX queue can support vectorized RX. + * + * @param rxq + * Pointer to RX queue. + * + * @return + * 1 if supported, negative errno value if not. + */ +int __rte_cold +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq) +{ + struct mlx5_rxq_ctrl *ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + + if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv))) + return -ENOTSUP; + if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0) + return -ENOTSUP; + if (rxq->lro) + return -ENOTSUP; + return 1; +} + +/** + * Check a device can support vectorized RX. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 1 if supported, negative errno value if not. + */ +int __rte_cold +mlx5_check_vec_rx_support(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint16_t i; + + if (!priv->config.rx_vec_en) + return -ENOTSUP; + if (mlx5_mprq_enabled(dev)) + return -ENOTSUP; + /* All the configured queues should support. 
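+ * Vectorized Rx is an all-or-nothing choice per device: if any configured
+ * queue fails the per-queue check below, the function returns -ENOTSUP for
+ * the whole port.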
*/ + for (i = 0; i < priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (!rxq) + continue; + if (mlx5_rxq_check_vec_support(rxq) < 0) + break; + } + if (i != priv->rxqs_n) + return -ENOTSUP; + return 1; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h new file mode 100644 index 000000000..6ddcbfb0a --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_VEC_H_ +#define RTE_PMD_MLX5_RXTX_VEC_H_ + +#include <rte_common.h> +#include <rte_mbuf.h> + +#include <mlx5_prm.h> + +#include "mlx5_autoconf.h" + +#include "mlx5_mr.h" + +/* HW checksum offload capabilities of vectorized Tx. */ +#define MLX5_VEC_TX_CKSUM_OFFLOAD_CAP \ + (DEV_TX_OFFLOAD_IPV4_CKSUM | \ + DEV_TX_OFFLOAD_UDP_CKSUM | \ + DEV_TX_OFFLOAD_TCP_CKSUM | \ + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) + +/* + * Compile time sanity check for vectorized functions. + */ + +#define S_ASSERT_RTE_MBUF(s) \ + static_assert(s, "A field of struct rte_mbuf is changed") +#define S_ASSERT_MLX5_CQE(s) \ + static_assert(s, "A field of struct mlx5_cqe is changed") + +/* rxq_cq_decompress_v() */ +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, hash) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); + +/* rxq_cq_to_ptype_oflags_v() */ +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, ol_flags) == + offsetof(struct rte_mbuf, rearm_data) + 8); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, rearm_data) == + RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); + +/* rxq_burst_v() */ +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); +#if (RTE_CACHE_LINE_SIZE == 128) +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 64); +#else +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 0); +#endif +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rx_hash_res) == + offsetof(struct mlx5_cqe, pkt_info) + 12); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rsvd1) + 11 == + offsetof(struct mlx5_cqe, hdr_type_etc)); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, vlan_info) == + offsetof(struct mlx5_cqe, hdr_type_etc) + 2); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, lro_num_seg) + 12 == + offsetof(struct mlx5_cqe, byte_cnt)); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) == + RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8)); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) == + offsetof(struct mlx5_cqe, sop_drop_qpn) + 7); + +/** + * Replenish buffers for RX in bulk. + * + * @param rxq + * Pointer to RX queue structure. + * @param n + * Number of buffers to be replenished. 
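+ *
+ * Note: the requested count is clamped so the bulk allocation never crosses
+ * the end of the ring. For example (illustrative sketch, assuming
+ * MLX5_VPMD_DESCS_PER_LOOP == 4), with elts_n == 9 (q_n == 512) and
+ * rq_ci == 510, a request of n == 128 is reduced to
+ * RTE_MIN(128 - 4, 512 - 510) == 2; the remainder is replenished on a
+ * subsequent call.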
+ */ +static inline void +mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n) +{ + const uint16_t q_n = 1 << rxq->elts_n; + const uint16_t q_mask = q_n - 1; + uint16_t elts_idx = rxq->rq_ci & q_mask; + struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; + volatile struct mlx5_wqe_data_seg *wq = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx]; + unsigned int i; + + MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)); + MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi))); + MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) > + MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); + if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { + rxq->stats.rx_nombuf += n; + return; + } + for (i = 0; i < n; ++i) { + void *buf_addr; + + /* + * In order to support the mbufs with external attached + * data buffer we should use the buf_addr pointer instead of + * rte_mbuf_buf_addr(). It touches the mbuf itself and may + * impact the performance. + */ + buf_addr = elts[i]->buf_addr; + wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr + + RTE_PKTMBUF_HEADROOM); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]); + } + rxq->rq_ci += n; + /* Prevent overflowing into consumed mbufs. */ + elts_idx = rxq->rq_ci & q_mask; + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + (*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); +} + +#endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h new file mode 100644 index 000000000..26715ef45 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h @@ -0,0 +1,1114 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ +#define RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +#include <rte_altivec.h> + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_rxtx_vec.h" +#include "mlx5_autoconf.h" + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + +/** + * Store free buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + */ +static inline void +rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + vector unsigned char mbp; + + mbp = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos]); + *(vector unsigned char *)&pkts[pos] = mbp; + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. 
+ * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. + * + * @return + * Number of mini-CQEs successfully decompressed. + */ +static inline uint16_t +rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts) +{ + volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info; + struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */ + const vector unsigned char zero = (vector unsigned char){0}; + /* Mask to shuffle from extracted mini CQE to mbuf. */ + const vector unsigned char shuf_mask1 = (vector unsigned char){ + -1, -1, -1, -1, /* skip packet_type */ + 7, 6, -1, -1, /* bswap16, pkt_len */ + 7, 6, /* bswap16, data_len */ + -1, -1, /* skip vlan_tci */ + 3, 2, 1, 0}; /* bswap32, rss */ + const vector unsigned char shuf_mask2 = (vector unsigned char){ + -1, -1, -1, -1, /* skip packet_type */ + 15, 14, -1, -1, /* bswap16, pkt_len */ + 15, 14, /* data_len, bswap16 */ + -1, -1, /* skip vlan_tci */ + 11, 10, 9, 8}; /* bswap32, rss */ + /* Restore the compressed count. Must be 16 bits. */ + const uint16_t mcqe_n = t_pkt->data_len + + (rxq->crc_present * RTE_ETHER_CRC_LEN); + const vector unsigned char rearm = + (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&t_pkt->rearm_data); + const vector unsigned char rxdf = + (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&t_pkt->rx_descriptor_fields1); + const vector unsigned char crc_adj = + (vector unsigned char)(vector unsigned short){ + 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0}; + const vector unsigned short rxdf_sel_mask = + (vector unsigned short){ + 0xffff, 0xffff, 0, 0, 0, 0xffff, 0, 0}; + const uint32_t flow_tag = t_pkt->hash.fdir.hi; + unsigned int pos; + unsigned int i; + unsigned int inv = 0; + +#ifdef MLX5_PMD_SOFT_COUNTERS + const vector unsigned char ones = vec_splat_u8(-1); + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const vector unsigned char len_shuf_mask = (vector unsigned char){ + 3, 2, 11, 10, + 7, 6, 15, 14, + -1, -1, -1, -1, + -1, -1, -1, -1}; +#endif + + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. + * C. combine data from mCQEs with rx_descriptor_fields1. + * D. store rx_descriptor_fields1. + * E. store flow tag (rte_flow mark). + */ + for (pos = 0; pos < mcqe_n; ) { + vector unsigned char mcqe1, mcqe2; + vector unsigned char rxdf1, rxdf2; +#ifdef MLX5_PMD_SOFT_COUNTERS + const vector unsigned short mcqe_sel_mask = + (vector unsigned short){0, 0, 0xffff, 0xffff, + 0, 0, 0xfff, 0xffff}; + const vector unsigned char lower_half = { + 0, 1, 4, 5, 8, 9, 12, 13, 16, + 17, 20, 21, 24, 25, 28, 29}; + const vector unsigned char upper_half = { + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + vector unsigned short left, right; + vector unsigned char byte_cnt, invalid_mask; + vector unsigned long lshift; + __attribute__((altivec(vector__))) + __attribute__((altivec(bool__))) + unsigned long long shmask; + const vector unsigned long shmax = {64, 64}; +#endif + + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + if (likely(pos + i < mcqe_n)) + rte_prefetch0((void *)(cq + pos + i)); + + /* A.1 load mCQEs into a 128bit register. 
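+ * Each struct mlx5_mini_cqe8 is 8 bytes, so a single 16-byte load picks up
+ * two mini-CQEs; the two loads below (at mcq[pos % 8] and mcq[pos % 8 + 2])
+ * therefore cover the four descriptors processed per loop iteration.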
*/ + mcqe1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&mcq[pos % 8]); + mcqe2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&mcq[pos % 8 + 2]); + + /* B.1 store rearm data to mbuf. */ + *(vector unsigned char *) + &elts[pos]->rearm_data = rearm; + *(vector unsigned char *) + &elts[pos + 1]->rearm_data = rearm; + + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = vec_perm(mcqe1, zero, shuf_mask1); + rxdf2 = vec_perm(mcqe1, zero, shuf_mask2); + rxdf1 = (vector unsigned char) + ((vector unsigned short)rxdf1 - + (vector unsigned short)crc_adj); + rxdf2 = (vector unsigned char) + ((vector unsigned short)rxdf2 - + (vector unsigned short)crc_adj); + rxdf1 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf1, + (vector unsigned short)rxdf, rxdf_sel_mask); + rxdf2 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf2, + (vector unsigned short)rxdf, rxdf_sel_mask); + + /* D.1 store rx_descriptor_fields1. */ + *(vector unsigned char *) + &elts[pos]->rx_descriptor_fields1 = rxdf1; + *(vector unsigned char *) + &elts[pos + 1]->rx_descriptor_fields1 = rxdf2; + + /* B.1 store rearm data to mbuf. */ + *(vector unsigned char *) + &elts[pos + 2]->rearm_data = rearm; + *(vector unsigned char *) + &elts[pos + 3]->rearm_data = rearm; + + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = vec_perm(mcqe2, zero, shuf_mask1); + rxdf2 = vec_perm(mcqe2, zero, shuf_mask2); + rxdf1 = (vector unsigned char) + ((vector unsigned short)rxdf1 - + (vector unsigned short)crc_adj); + rxdf2 = (vector unsigned char) + ((vector unsigned short)rxdf2 - + (vector unsigned short)crc_adj); + rxdf1 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf1, + (vector unsigned short)rxdf, rxdf_sel_mask); + rxdf2 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf2, + (vector unsigned short)rxdf, rxdf_sel_mask); + + /* D.1 store rx_descriptor_fields1. 
*/ + *(vector unsigned char *) + &elts[pos + 2]->rx_descriptor_fields1 = rxdf1; + *(vector unsigned char *) + &elts[pos + 3]->rx_descriptor_fields1 = rxdf2; + +#ifdef MLX5_PMD_SOFT_COUNTERS + invalid_mask = (vector unsigned char)(vector unsigned long){ + (mcqe_n - pos) * sizeof(uint16_t) * 8, 0}; + + lshift = + vec_splat((vector unsigned long)invalid_mask, 0); + shmask = vec_cmpgt(shmax, lshift); + invalid_mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + invalid_mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)invalid_mask, shmask); + + mcqe1 = (vector unsigned char) + vec_sro((vector unsigned short)mcqe1, + (vector unsigned char){32}), + byte_cnt = (vector unsigned char) + vec_sel((vector unsigned short)mcqe1, + (vector unsigned short)mcqe2, mcqe_sel_mask); + byte_cnt = vec_perm(byte_cnt, zero, len_shuf_mask); + byte_cnt = (vector unsigned char) + vec_andc((vector unsigned long)byte_cnt, + (vector unsigned long)invalid_mask); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + rcvd_byte += ((vector unsigned long)byte_cnt)[0]; +#endif + + if (rxq->mark) { + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } + if (rxq->dynf_meta) { + int32_t offs = rxq->flow_meta_offset; + const uint32_t meta = + *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); + + /* Check if title packet has valid metadata. */ + if (meta) { + MLX5_ASSERT(t_pkt->ol_flags & + rxq->flow_meta_mask); + *RTE_MBUF_DYNFIELD(elts[pos], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, + uint32_t *) = meta; + } + } + + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { + mcq = (void *)&(cq + pos)->pkt_info; + for (i = 0; i < 8; ++i) + cq[inv++].op_own = MLX5_CQE_INVALIDATE; + } + } + + /* Invalidate the rest of CQEs. */ + for (; inv < mcqe_n; ++inv) + cq[inv].op_own = MLX5_CQE_INVALIDATE; + +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += mcqe_n; + rxq->stats.ibytes += rcvd_byte; +#endif + + rxq->cq_ci += mcqe_n; + return mcqe_n; +} + +/** + * Calculate packet type and offload flag for mbuf and store it. + * + * @param rxq + * Pointer to RX queue structure. + * @param cqes[4] + * Array of four 16bytes completions extracted from the original completion + * descriptor. + * @param op_err + * Opcode vector having responder error status. Each field is 4B. + * @param pkts + * Pointer to array of packets to be filled. 
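+ *
+ * Per-packet scalar equivalent of the final lookup (illustrative sketch):
+ *   pkts[i]->packet_type = mlx5_ptype_table[pt_idx] |
+ *                          !!(pt_idx & (1 << 6)) * rxq->tunnel;
+ * i.e. bit 6 of the index extracted from the CQE marks a tunneled packet
+ * and conditionally ORs the tunnel ptype into the result.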
+ */ +static inline void +rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, + vector unsigned char cqes[4], vector unsigned char op_err, + struct rte_mbuf **pkts) +{ + vector unsigned char pinfo0, pinfo1; + vector unsigned char pinfo, ptype; + vector unsigned char ol_flags = (vector unsigned char) + (vector unsigned int){ + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP, + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP, + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP, + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP}; + vector unsigned char cv_flags; + const vector unsigned char zero = (vector unsigned char){0}; + const vector unsigned char ptype_mask = + (vector unsigned char)(vector unsigned int){ + 0x0000fd06, 0x0000fd06, 0x0000fd06, 0x0000fd06}; + const vector unsigned char ptype_ol_mask = + (vector unsigned char)(vector unsigned int){ + 0x00000106, 0x00000106, 0x00000106, 0x00000106}; + const vector unsigned char pinfo_mask = + (vector unsigned char)(vector unsigned int){ + 0x00000003, 0x00000003, 0x00000003, 0x00000003}; + const vector unsigned char cv_flag_sel = (vector unsigned char){ + 0, (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED), + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const vector unsigned char cv_mask = + (vector unsigned char)(vector unsigned int){ + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED}; + const vector unsigned char mbuf_init = + (vector unsigned char)vec_vsx_ld + (0, (vector unsigned char *)&rxq->mbuf_initializer); + const vector unsigned short rearm_sel_mask = + (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0}; + vector unsigned char rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; + + /* Extract pkt_info field. */ + pinfo0 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[0], + (vector unsigned int)cqes[1]); + pinfo1 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[2], + (vector unsigned int)cqes[3]); + pinfo = (vector unsigned char) + vec_mergeh((vector unsigned long)pinfo0, + (vector unsigned long)pinfo1); + + /* Extract hdr_type_etc field. 
*/ + pinfo0 = (vector unsigned char) + vec_mergel((vector unsigned int)cqes[0], + (vector unsigned int)cqes[1]); + pinfo1 = (vector unsigned char) + vec_mergel((vector unsigned int)cqes[2], + (vector unsigned int)cqes[3]); + ptype = (vector unsigned char) + vec_mergeh((vector unsigned long)pinfo0, + (vector unsigned long)pinfo1); + + if (rxq->mark) { + const vector unsigned char pinfo_ft_mask = + (vector unsigned char)(vector unsigned int){ + 0xffffff00, 0xffffff00, 0xffffff00, 0xffffff00}; + const vector unsigned char fdir_flags = + (vector unsigned char)(vector unsigned int){ + PKT_RX_FDIR, PKT_RX_FDIR, + PKT_RX_FDIR, PKT_RX_FDIR}; + vector unsigned char fdir_id_flags = + (vector unsigned char)(vector unsigned int){ + PKT_RX_FDIR_ID, PKT_RX_FDIR_ID, + PKT_RX_FDIR_ID, PKT_RX_FDIR_ID}; + vector unsigned char flow_tag, invalid_mask; + + flow_tag = (vector unsigned char) + vec_and((vector unsigned long)pinfo, + (vector unsigned long)pinfo_ft_mask); + + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)flow_tag, + (vector unsigned int)zero); + ol_flags = (vector unsigned char) + vec_or((vector unsigned long)ol_flags, + (vector unsigned long) + vec_andc((vector unsigned long)fdir_flags, + (vector unsigned long)invalid_mask)); + + /* Mask out invalid entries. */ + fdir_id_flags = (vector unsigned char) + vec_andc((vector unsigned long)fdir_id_flags, + (vector unsigned long)invalid_mask); + + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = (vector unsigned char) + vec_or((vector unsigned long)ol_flags, + (vector unsigned long) + vec_andc((vector unsigned long)fdir_id_flags, + (vector unsigned long) + vec_cmpeq((vector unsigned int)flow_tag, + (vector unsigned int)pinfo_ft_mask))); + } + /* + * Merge the two fields to generate the following: + * bit[1] = l3_ok + * bit[2] = l4_ok + * bit[8] = cv + * bit[11:10] = l3_hdr_type + * bit[14:12] = l4_hdr_type + * bit[15] = ip_frag + * bit[16] = tunneled + * bit[17] = outer_l3_type + */ + ptype = (vector unsigned char) + vec_and((vector unsigned long)ptype, + (vector unsigned long)ptype_mask); + pinfo = (vector unsigned char) + vec_and((vector unsigned long)pinfo, + (vector unsigned long)pinfo_mask); + pinfo = (vector unsigned char) + vec_sl((vector unsigned int)pinfo, + (vector unsigned int){16, 16, 16, 16}); + + /* Make pinfo has merged fields for ol_flags calculation. */ + pinfo = (vector unsigned char) + vec_or((vector unsigned long)ptype, + (vector unsigned long)pinfo); + ptype = (vector unsigned char) + vec_sr((vector unsigned int)pinfo, + (vector unsigned int){10, 10, 10, 10}); + ptype = (vector unsigned char) + vec_packs((vector unsigned int)ptype, + (vector unsigned int)zero); + + /* Errored packets will have RTE_PTYPE_ALL_MASK. 
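+ * Such packets still consume an mbuf; they are dropped and freed later by
+ * rxq_handle_pending_error() before the burst returns to the application.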
*/ + op_err = (vector unsigned char) + vec_sr((vector unsigned short)op_err, + (vector unsigned short){8, 8, 8, 8, 8, 8, 8, 8}); + ptype = (vector unsigned char) + vec_or((vector unsigned long)ptype, + (vector unsigned long)op_err); + + pt_idx0 = (uint8_t)((vector unsigned char)ptype)[0]; + pt_idx1 = (uint8_t)((vector unsigned char)ptype)[2]; + pt_idx2 = (uint8_t)((vector unsigned char)ptype)[4]; + pt_idx3 = (uint8_t)((vector unsigned char)ptype)[6]; + + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; + + /* Fill flags for checksum and VLAN. */ + pinfo = (vector unsigned char) + vec_and((vector unsigned long)pinfo, + (vector unsigned long)ptype_ol_mask); + pinfo = vec_perm(cv_flag_sel, zero, pinfo); + + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */ + cv_flags = (vector unsigned char) + vec_sl((vector unsigned int)pinfo, + (vector unsigned int){9, 9, 9, 9}); + cv_flags = (vector unsigned char) + vec_or((vector unsigned long)pinfo, + (vector unsigned long)cv_flags); + + /* Move back flags to start from byte[0]. */ + cv_flags = (vector unsigned char) + vec_sr((vector unsigned int)cv_flags, + (vector unsigned int){8, 8, 8, 8}); + + /* Mask out garbage bits. */ + cv_flags = (vector unsigned char) + vec_and((vector unsigned long)cv_flags, + (vector unsigned long)cv_mask); + + /* Merge to ol_flags. */ + ol_flags = (vector unsigned char) + vec_or((vector unsigned long)ol_flags, + (vector unsigned long)cv_flags); + + /* Merge mbuf_init and ol_flags. */ + rearm0 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short) + vec_slo((vector unsigned short)ol_flags, + (vector unsigned char){64}), rearm_sel_mask); + rearm1 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short) + vec_slo((vector unsigned short)ol_flags, + (vector unsigned char){32}), rearm_sel_mask); + rearm2 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short)ol_flags, rearm_sel_mask); + rearm3 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short) + vec_sro((vector unsigned short)ol_flags, + (vector unsigned char){32}), rearm_sel_mask); + + /* Write 8B rearm_data and 8B ol_flags. */ + vec_vsx_st(rearm0, 0, + (vector unsigned char *)&pkts[0]->rearm_data); + vec_vsx_st(rearm1, 0, + (vector unsigned char *)&pkts[1]->rearm_data); + vec_vsx_st(rearm2, 0, + (vector unsigned char *)&pkts[2]->rearm_data); + vec_vsx_st(rearm3, 0, + (vector unsigned char *)&pkts[3]->rearm_data); +} + + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * + * @return + * Number of packets received including errors (<= pkts_n). 
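+ *
+ * Invoked from mlx5_rx_burst_vec(), whose calling pattern is (sketch):
+ *   uint64_t err = 0;
+ *   nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
+ *   if (unlikely(err | rxq->err_state))
+ *       nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);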
+ */ +static inline uint16_t +rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint64_t *err) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const vector unsigned char zero = (vector unsigned char){0}; + const vector unsigned char ones = vec_splat_u8(-1); + const vector unsigned char owner_check = + (vector unsigned char)(vector unsigned long){ + 0x0100000001000000LL, 0x0100000001000000LL}; + const vector unsigned char opcode_check = + (vector unsigned char)(vector unsigned long){ + 0xf0000000f0000000LL, 0xf0000000f0000000LL}; + const vector unsigned char format_check = + (vector unsigned char)(vector unsigned long){ + 0x0c0000000c000000LL, 0x0c0000000c000000LL}; + const vector unsigned char resp_err_check = + (vector unsigned char)(vector unsigned long){ + 0xe0000000e0000000LL, 0xe0000000e0000000LL}; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const vector unsigned char len_shuf_mask = (vector unsigned char){ + 1, 0, 5, 4, + 9, 8, 13, 12, + -1, -1, -1, -1, + -1, -1, -1, -1}; +#endif + /* Mask to shuffle from extracted CQE to mbuf. */ + const vector unsigned char shuf_mask = (vector unsigned char){ + 5, 4, /* bswap16, pkt_len */ + -1, -1, /* zero out 2nd half of pkt_len */ + 5, 4, /* bswap16, data_len */ + 11, 10, /* bswap16, vlan+tci */ + 15, 14, 13, 12, /* bswap32, rss */ + 1, 2, 3, -1}; /* fdir.hi */ + /* Mask to blend from the last Qword to the first DQword. */ + /* Mask to blend from the last Qword to the first DQword. */ + const vector unsigned char blend_mask = (vector unsigned char){ + -1, 0, 0, 0, + 0, 0, 0, 0, + -1, -1, -1, -1, + -1, -1, -1, -1}; + const vector unsigned char crc_adj = + (vector unsigned char)(vector unsigned short){ + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0}; + const vector unsigned char flow_mark_adj = + (vector unsigned char)(vector unsigned int){ + 0, 0, 0, rxq->mark * (-1)}; + const vector unsigned short cqe_sel_mask1 = + (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0}; + const vector unsigned short cqe_sel_mask2 = + (vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0}; + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + pkts += rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. 
*/ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remaining packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remaining CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + vector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP]; + vector unsigned char cqe_tmp1, cqe_tmp2; + vector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + vector unsigned char op_own, op_own_tmp1, op_own_tmp2; + vector unsigned char opcode, owner_mask, invalid_mask; + vector unsigned char comp_mask; + vector unsigned char mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + const vector unsigned char lower_half = { + 0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const vector unsigned char upper_half = { + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + const vector unsigned long shmax = {64, 64}; + vector unsigned char byte_cnt; + vector unsigned short left, right; + vector unsigned long lshift; + vector __attribute__((altivec(bool__))) + unsigned long shmask; +#endif + vector unsigned char mbp1, mbp2; + vector unsigned char p = + (vector unsigned char)(vector unsigned short){ + 0, 1, 2, 3, 0, 0, 0, 0}; + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + + /* A.0 do not cross the end of CQ. */ + mask = (vector unsigned char)(vector unsigned long){ + (pkts_n - pos) * sizeof(uint16_t) * 8, 0}; + + { + vector unsigned long lshift; + vector __attribute__((altivec(bool__))) + unsigned long shmask; + const vector unsigned long shmax = {64, 64}; + + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + } + + p = (vector unsigned char) + vec_andc((vector unsigned long)p, + (vector unsigned long)mask); + + /* A.1 load cqes. */ + p3 = (unsigned int)((vector unsigned short)p)[3]; + cqes[3] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p3].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + p2 = (unsigned int)((vector unsigned short)p)[2]; + cqes[2] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p2].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + /* B.1 load mbuf pointers. */ + mbp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos]); + mbp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos + 2]); + + /* A.1 load a block having op_own. 
*/ + p1 = (unsigned int)((vector unsigned short)p)[1]; + cqes[1] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p1].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + cqes[0] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + /* B.2 copy mbuf pointers. */ + *(vector unsigned char *)&pkts[pos] = mbp1; + *(vector unsigned char *)&pkts[pos + 2] = mbp2; + rte_cio_rmb(); + + /* C.1 load remaining CQE data and extract necessary fields. */ + cqe_tmp2 = *(vector unsigned char *) + &cq[pos + p3].pkt_info; + cqe_tmp1 = *(vector unsigned char *) + &cq[pos + p2].pkt_info; + cqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p3].csum); + cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p2].csum); + cqes[3] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[3], + (vector unsigned short)cqe_tmp2, cqe_sel_mask1); + cqes[2] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[2], + (vector unsigned short)cqe_tmp1, cqe_sel_mask1); + cqe_tmp2 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p3].rsvd3[9], 0LL}; + cqe_tmp1 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p2].rsvd3[9], 0LL}; + cqes[3] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[3], + (vector unsigned short)cqe_tmp2, + (vector unsigned short)cqe_sel_mask2); + cqes[2] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[2], + (vector unsigned short)cqe_tmp1, + (vector unsigned short)cqe_sel_mask2); + + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = vec_perm(cqes[3], zero, shuf_mask); + pkt_mb2 = vec_perm(cqes[2], zero, shuf_mask); + + /* C.3 adjust CRC length. */ + pkt_mb3 = (vector unsigned char) + ((vector unsigned short)pkt_mb3 - + (vector unsigned short)crc_adj); + pkt_mb2 = (vector unsigned char) + ((vector unsigned short)pkt_mb2 - + (vector unsigned short)crc_adj); + + /* C.4 adjust flow mark. */ + pkt_mb3 = (vector unsigned char) + ((vector unsigned int)pkt_mb3 + + (vector unsigned int)flow_mark_adj); + pkt_mb2 = (vector unsigned char) + ((vector unsigned int)pkt_mb2 + + (vector unsigned int)flow_mark_adj); + + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + *(vector unsigned char *) + &pkts[pos + 3]->pkt_len = pkt_mb3; + *(vector unsigned char *) + &pkts[pos + 2]->pkt_len = pkt_mb2; + + /* E.1 extract op_own field. */ + op_own_tmp2 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[2], + (vector unsigned int)cqes[3]); + + /* C.1 load remaining CQE data and extract necessary fields. 
*/ + cqe_tmp2 = *(vector unsigned char *) + &cq[pos + p1].pkt_info; + cqe_tmp1 = *(vector unsigned char *) + &cq[pos].pkt_info; + cqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = vec_sel(cqes[0], cqe_tmp2, blend_mask); + cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p1].csum); + cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos].csum); + cqes[1] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[1], + (vector unsigned short)cqe_tmp2, cqe_sel_mask1); + cqes[0] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[0], + (vector unsigned short)cqe_tmp1, cqe_sel_mask1); + cqe_tmp2 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p1].rsvd3[9], 0LL}; + cqe_tmp1 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos].rsvd3[9], 0LL}; + cqes[1] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[1], + (vector unsigned short)cqe_tmp2, cqe_sel_mask2); + cqes[0] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[0], + (vector unsigned short)cqe_tmp1, cqe_sel_mask2); + + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = vec_perm(cqes[1], zero, shuf_mask); + pkt_mb0 = vec_perm(cqes[0], zero, shuf_mask); + + /* C.3 adjust CRC length. */ + pkt_mb1 = (vector unsigned char) + ((vector unsigned short)pkt_mb1 - + (vector unsigned short)crc_adj); + pkt_mb0 = (vector unsigned char) + ((vector unsigned short)pkt_mb0 - + (vector unsigned short)crc_adj); + + /* C.4 adjust flow mark. */ + pkt_mb1 = (vector unsigned char) + ((vector unsigned int)pkt_mb1 + + (vector unsigned int)flow_mark_adj); + pkt_mb0 = (vector unsigned char) + ((vector unsigned int)pkt_mb0 + + (vector unsigned int)flow_mark_adj); + + /* E.1 extract op_own byte. */ + op_own_tmp1 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[0], + (vector unsigned int)cqes[1]); + op_own = (vector unsigned char) + vec_mergel((vector unsigned long)op_own_tmp1, + (vector unsigned long)op_own_tmp2); + + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + *(vector unsigned char *) + &pkts[pos + 1]->pkt_len = pkt_mb1; + *(vector unsigned char *) + &pkts[pos]->pkt_len = pkt_mb0; + + /* E.2 flip owner bit to mark CQEs from last round. */ + owner_mask = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)owner_check); + if (ownership) + owner_mask = (vector unsigned char) + vec_xor((vector unsigned long)owner_mask, + (vector unsigned long)owner_check); + owner_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)owner_mask, + (vector unsigned int)owner_check); + owner_mask = (vector unsigned char) + vec_packs((vector unsigned int)owner_mask, + (vector unsigned int)zero); + + /* E.3 get mask for invalidated CQEs. */ + opcode = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)opcode_check); + invalid_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)opcode_check, + (vector unsigned int)opcode); + invalid_mask = (vector unsigned char) + vec_packs((vector unsigned int)invalid_mask, + (vector unsigned int)zero); + + /* E.4 mask out beyond boundary. */ + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* E.5 merge invalid_mask with invalid owner. 
*/ + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)owner_mask); + + /* F.1 find compressed CQE format. */ + comp_mask = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)format_check); + comp_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)comp_mask, + (vector unsigned int)format_check); + comp_mask = (vector unsigned char) + vec_packs((vector unsigned int)comp_mask, + (vector unsigned int)zero); + + /* F.2 mask out invalid entries. */ + comp_mask = (vector unsigned char) + vec_andc((vector unsigned long)comp_mask, + (vector unsigned long)invalid_mask); + comp_idx = ((vector unsigned long)comp_mask)[0]; + + /* F.3 get the first compressed CQE. */ + comp_idx = comp_idx ? __builtin_ctzll(comp_idx) / + (sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP; + + /* E.6 mask out entries after the compressed CQE. */ + mask = (vector unsigned char)(vector unsigned long){ + (comp_idx * sizeof(uint16_t) * 8), 0}; + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* E.7 count non-compressed valid CQEs. */ + n = ((vector unsigned long)invalid_mask)[0]; + n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + nocmp_n += n; + + /* D.2 get the final invalid mask. */ + mask = (vector unsigned char)(vector unsigned long){ + (n * sizeof(uint16_t) * 8), 0}; + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* D.3 check error in opcode. */ + opcode = (vector unsigned char) + vec_cmpeq((vector unsigned int)resp_err_check, + (vector unsigned int)opcode); + opcode = (vector unsigned char) + vec_packs((vector unsigned int)opcode, + (vector unsigned int)zero); + opcode = (vector unsigned char) + vec_andc((vector unsigned long)opcode, + (vector unsigned long)invalid_mask); + + /* D.4 mark if any error is set */ + *err |= ((vector unsigned long)opcode)[0]; + + /* D.5 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]); + if (rxq->hw_timestamp) { + pkts[pos]->timestamp = + rte_be_to_cpu_64(cq[pos].timestamp); + pkts[pos + 1]->timestamp = + rte_be_to_cpu_64(cq[pos + p1].timestamp); + pkts[pos + 2]->timestamp = + rte_be_to_cpu_64(cq[pos + p2].timestamp); + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } + if (rxq->dynf_meta) { + uint64_t flag = rxq->flow_meta_mask; + int32_t offs = rxq->flow_meta_offset; + uint32_t metadata; + + /* This code is subject for futher optimization. */ + metadata = cq[pos].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + metadata; + pkts[pos]->ol_flags |= metadata ? flag : 0ULL; + metadata = cq[pos + 1].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + metadata; + pkts[pos + 1]->ol_flags |= metadata ? 
flag : 0ULL; + metadata = cq[pos + 2].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + metadata; + pkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL; + metadata = cq[pos + 3].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + metadata; + pkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ + byte_cnt = vec_perm(op_own, zero, len_shuf_mask); + byte_cnt = (vector unsigned char) + vec_andc((vector unsigned long)byte_cnt, + (vector unsigned long)invalid_mask); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + rcvd_byte += ((vector unsigned long)byte_cnt)[0]; +#endif + + /* + * Break the loop unless more valid CQE is expected, or if + * there's a compressed CQE. + */ + if (n != MLX5_VPMD_DESCS_PER_LOOP) + break; + } + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) + return rcvd_pkt; + /* Update the consumer indexes for non-compressed CQEs. */ + MLX5_ASSERT(nocmp_n <= pkts_n); + rxq->cq_ci += nocmp_n; + rxq->rq_pi += nocmp_n; + rcvd_pkt += nocmp_n; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += nocmp_n; + rxq->stats.ibytes += rcvd_byte; +#endif + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { + MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq->decompressed = + rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]); + /* Return more packets if needed. */ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->decompressed; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); + rxq->rq_pi += n; + rcvd_pkt += n; + rxq->decompressed -= n; + } + } + rte_compiler_barrier(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + return rcvd_pkt; +} + +#endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h new file mode 100644 index 000000000..ecafbf800 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -0,0 +1,780 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_ +#define RTE_PMD_MLX5_RXTX_VEC_NEON_H_ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <arm_neon.h> + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_rxtx_vec.h" +#include "mlx5_autoconf.h" + +#pragma GCC diagnostic ignored "-Wcast-qual" + +/** + * Store free buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. 
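+ *
+ * Scalar equivalent of the copy (illustrative sketch):
+ *   struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
+ *   for (pos = 0; pos < n; ++pos)
+ *       pkts[pos] = elts[pos];
+ * The vector version below moves two pointers per 128-bit load/store and
+ * copies an odd trailing pointer separately.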
+ */ +static inline void +rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + uint64x2_t mbp; + + mbp = vld1q_u64((void *)&elts[pos]); + vst1q_u64((void *)&pkts[pos], mbp); + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. + * + * @return + * Number of mini-CQEs successfully decompressed. + */ +static inline uint16_t +rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts) +{ + volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info; + struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */ + unsigned int pos; + unsigned int i; + unsigned int inv = 0; + /* Mask to shuffle from extracted mini CQE to mbuf. */ + const uint8x16_t mcqe_shuf_m1 = { + -1, -1, -1, -1, /* skip packet_type */ + 7, 6, -1, -1, /* pkt_len, bswap16 */ + 7, 6, /* data_len, bswap16 */ + -1, -1, /* skip vlan_tci */ + 3, 2, 1, 0 /* hash.rss, bswap32 */ + }; + const uint8x16_t mcqe_shuf_m2 = { + -1, -1, -1, -1, /* skip packet_type */ + 15, 14, -1, -1, /* pkt_len, bswap16 */ + 15, 14, /* data_len, bswap16 */ + -1, -1, /* skip vlan_tci */ + 11, 10, 9, 8 /* hash.rss, bswap32 */ + }; + /* Restore the compressed count. Must be 16 bits. */ + const uint16_t mcqe_n = t_pkt->data_len + + (rxq->crc_present * RTE_ETHER_CRC_LEN); + const uint64x2_t rearm = + vld1q_u64((void *)&t_pkt->rearm_data); + const uint32x4_t rxdf_mask = { + 0xffffffff, /* packet_type */ + 0, /* skip pkt_len */ + 0xffff0000, /* vlan_tci, skip data_len */ + 0, /* skip hash.rss */ + }; + const uint8x16_t rxdf = + vandq_u8(vld1q_u8((void *)&t_pkt->rx_descriptor_fields1), + vreinterpretq_u8_u32(rxdf_mask)); + const uint16x8_t crc_adj = { + 0, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + 0, 0 + }; + const uint32_t flow_tag = t_pkt->hash.fdir.hi; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; +#endif + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const uint8x8_t len_shuf_m = { + 7, 6, /* 1st mCQE */ + 15, 14, /* 2nd mCQE */ + 23, 22, /* 3rd mCQE */ + 31, 30 /* 4th mCQE */ + }; + + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. + * C. combine data from mCQEs with rx_descriptor_fields1. + * D. store rx_descriptor_fields1. + * E. store flow tag (rte_flow mark). + */ + for (pos = 0; pos < mcqe_n; ) { + uint8_t *p = (void *)&mcq[pos % 8]; + uint8_t *e0 = (void *)&elts[pos]->rearm_data; + uint8_t *e1 = (void *)&elts[pos + 1]->rearm_data; + uint8_t *e2 = (void *)&elts[pos + 2]->rearm_data; + uint8_t *e3 = (void *)&elts[pos + 3]->rearm_data; + uint16x4_t byte_cnt; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint16x4_t invalid_mask = + vcreate_u16(mcqe_n - pos < MLX5_VPMD_DESCS_PER_LOOP ? 
+ -1UL << ((mcqe_n - pos) * + sizeof(uint16_t) * 8) : 0); +#endif + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + if (likely(pos + i < mcqe_n)) + rte_prefetch0((void *)(cq + pos + i)); + __asm__ volatile ( + /* A.1 load mCQEs into a 128bit register. */ + "ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t" + /* B.1 store rearm data to mbuf. */ + "st1 {%[rearm].2d}, [%[e0]] \n\t" + "add %[e0], %[e0], #16 \n\t" + "st1 {%[rearm].2d}, [%[e1]] \n\t" + "add %[e1], %[e1], #16 \n\t" + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + "tbl v18.16b, {v16.16b}, %[mcqe_shuf_m1].16b \n\t" + "tbl v19.16b, {v16.16b}, %[mcqe_shuf_m2].16b \n\t" + "sub v18.8h, v18.8h, %[crc_adj].8h \n\t" + "sub v19.8h, v19.8h, %[crc_adj].8h \n\t" + "orr v18.16b, v18.16b, %[rxdf].16b \n\t" + "orr v19.16b, v19.16b, %[rxdf].16b \n\t" + /* D.1 store rx_descriptor_fields1. */ + "st1 {v18.2d}, [%[e0]] \n\t" + "st1 {v19.2d}, [%[e1]] \n\t" + /* B.1 store rearm data to mbuf. */ + "st1 {%[rearm].2d}, [%[e2]] \n\t" + "add %[e2], %[e2], #16 \n\t" + "st1 {%[rearm].2d}, [%[e3]] \n\t" + "add %[e3], %[e3], #16 \n\t" + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + "tbl v18.16b, {v17.16b}, %[mcqe_shuf_m1].16b \n\t" + "tbl v19.16b, {v17.16b}, %[mcqe_shuf_m2].16b \n\t" + "sub v18.8h, v18.8h, %[crc_adj].8h \n\t" + "sub v19.8h, v19.8h, %[crc_adj].8h \n\t" + "orr v18.16b, v18.16b, %[rxdf].16b \n\t" + "orr v19.16b, v19.16b, %[rxdf].16b \n\t" + /* D.1 store rx_descriptor_fields1. */ + "st1 {v18.2d}, [%[e2]] \n\t" + "st1 {v19.2d}, [%[e3]] \n\t" +#ifdef MLX5_PMD_SOFT_COUNTERS + "tbl %[byte_cnt].8b, {v16.16b - v17.16b}, %[len_shuf_m].8b \n\t" +#endif + :[byte_cnt]"=&w"(byte_cnt) + :[mcq]"r"(p), + [rxdf]"w"(rxdf), + [rearm]"w"(rearm), + [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0), + [mcqe_shuf_m1]"w"(mcqe_shuf_m1), + [mcqe_shuf_m2]"w"(mcqe_shuf_m2), + [crc_adj]"w"(crc_adj), + [len_shuf_m]"w"(len_shuf_m) + :"memory", "v16", "v17", "v18", "v19"); +#ifdef MLX5_PMD_SOFT_COUNTERS + byte_cnt = vbic_u16(byte_cnt, invalid_mask); + rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0); +#endif + if (rxq->mark) { + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } + if (rxq->dynf_meta) { + int32_t offs = rxq->flow_meta_offset; + const uint32_t meta = + *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); + + /* Check if title packet has valid metadata. */ + if (meta) { + MLX5_ASSERT(t_pkt->ol_flags & + rxq->flow_meta_mask); + *RTE_MBUF_DYNFIELD(elts[pos], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, + uint32_t *) = meta; + } + } + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { + mcq = (void *)&(cq + pos)->pkt_info; + for (i = 0; i < 8; ++i) + cq[inv++].op_own = MLX5_CQE_INVALIDATE; + } + } + /* Invalidate the rest of CQEs. */ + for (; inv < mcqe_n; ++inv) + cq[inv].op_own = MLX5_CQE_INVALIDATE; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += mcqe_n; + rxq->stats.ibytes += rcvd_byte; +#endif + rxq->cq_ci += mcqe_n; + return mcqe_n; +} + +/** + * Calculate packet type and offload flag for mbuf and store it. + * + * @param rxq + * Pointer to RX queue structure. 
+ * @param ptype_info + * Array of four 4bytes packet type info extracted from the original + * completion descriptor. + * @param flow_tag + * Array of four 4bytes flow ID extracted from the original completion + * descriptor. + * @param op_err + * Opcode vector having responder error status. Each field is 4B. + * @param pkts + * Pointer to array of packets to be filled. + */ +static inline void +rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, + uint32x4_t ptype_info, uint32x4_t flow_tag, + uint16x4_t op_err, struct rte_mbuf **pkts) +{ + uint16x4_t ptype; + uint32x4_t pinfo, cv_flags; + uint32x4_t ol_flags = + vdupq_n_u32(rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP); + const uint32x4_t ptype_ol_mask = { 0x106, 0x106, 0x106, 0x106 }; + const uint8x16_t cv_flag_sel = { + 0, + (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED), + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), + 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), + 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + const uint32x4_t cv_mask = + vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED); + const uint64x2_t mbuf_init = vld1q_u64 + ((const uint64_t *)&rxq->mbuf_initializer); + uint64x2_t rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; + + if (rxq->mark) { + const uint32x4_t ft_def = vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT); + const uint32x4_t fdir_flags = vdupq_n_u32(PKT_RX_FDIR); + uint32x4_t fdir_id_flags = vdupq_n_u32(PKT_RX_FDIR_ID); + uint32x4_t invalid_mask; + + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = vceqzq_u32(flow_tag); + ol_flags = vorrq_u32(ol_flags, + vbicq_u32(fdir_flags, invalid_mask)); + /* Mask out invalid entries. */ + fdir_id_flags = vbicq_u32(fdir_id_flags, invalid_mask); + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = vorrq_u32(ol_flags, + vbicq_u32(fdir_id_flags, + vceqq_u32(flow_tag, ft_def))); + } + /* + * ptype_info has the following: + * bit[1] = l3_ok + * bit[2] = l4_ok + * bit[8] = cv + * bit[11:10] = l3_hdr_type + * bit[14:12] = l4_hdr_type + * bit[15] = ip_frag + * bit[16] = tunneled + * bit[17] = outer_l3_type + */ + ptype = vshrn_n_u32(ptype_info, 10); + /* Errored packets will have RTE_PTYPE_ALL_MASK. */ + ptype = vorr_u16(ptype, op_err); + pt_idx0 = vget_lane_u8(vreinterpret_u8_u16(ptype), 6); + pt_idx1 = vget_lane_u8(vreinterpret_u8_u16(ptype), 4); + pt_idx2 = vget_lane_u8(vreinterpret_u8_u16(ptype), 2); + pt_idx3 = vget_lane_u8(vreinterpret_u8_u16(ptype), 0); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; + /* Fill flags for checksum and VLAN. */ + pinfo = vandq_u32(ptype_info, ptype_ol_mask); + pinfo = vreinterpretq_u32_u8( + vqtbl1q_u8(cv_flag_sel, vreinterpretq_u8_u32(pinfo))); + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */ + cv_flags = vshlq_n_u32(pinfo, 9); + cv_flags = vorrq_u32(pinfo, cv_flags); + /* Move back flags to start from byte[0]. */ + cv_flags = vshrq_n_u32(cv_flags, 8); + /* Mask out garbage bits. */ + cv_flags = vandq_u32(cv_flags, cv_mask); + /* Merge to ol_flags. 
*/ + ol_flags = vorrq_u32(ol_flags, cv_flags); + /* Merge mbuf_init and ol_flags, and store. */ + rearm0 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 3), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm1 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 2), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm2 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 1), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm3 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 0), + vreinterpretq_u32_u64(mbuf_init), 2)); + + vst1q_u64((void *)&pkts[0]->rearm_data, rearm0); + vst1q_u64((void *)&pkts[1]->rearm_data, rearm1); + vst1q_u64((void *)&pkts[2]->rearm_data, rearm2); + vst1q_u64((void *)&pkts[3]->rearm_data, rearm3); +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint64_t *err) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1))); + const uint16x4_t owner_check = vcreate_u16(0x0001000100010001); + const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0); + const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c); + const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; +#endif + /* Mask to generate 16B length vector. */ + const uint8x8_t len_shuf_m = { + 52, 53, /* 4th CQE */ + 36, 37, /* 3rd CQE */ + 20, 21, /* 2nd CQE */ + 4, 5 /* 1st CQE */ + }; + /* Mask to extract 16B data from a 64B CQE. */ + const uint8x16_t cqe_shuf_m = { + 28, 29, /* hdr_type_etc */ + 0, /* pkt_info */ + -1, /* null */ + 47, 46, /* byte_cnt, bswap16 */ + 31, 30, /* vlan_info, bswap16 */ + 15, 14, 13, 12, /* rx_hash_res, bswap32 */ + 57, 58, 59, /* flow_tag */ + 63 /* op_own */ + }; + /* Mask to generate 16B data for mbuf. */ + const uint8x16_t mb_shuf_m = { + 4, 5, -1, -1, /* pkt_len */ + 4, 5, /* data_len */ + 6, 7, /* vlan_tci */ + 8, 9, 10, 11, /* hash.rss */ + 12, 13, 14, -1 /* hash.fdir.hi */ + }; + /* Mask to generate 16B owner vector. */ + const uint8x8_t owner_shuf_m = { + 63, -1, /* 4th CQE */ + 47, -1, /* 3rd CQE */ + 31, -1, /* 2nd CQE */ + 15, -1 /* 1st CQE */ + }; + /* Mask to generate a vector having packet_type/ol_flags. */ + const uint8x16_t ptype_shuf_m = { + 48, 49, 50, -1, /* 4th CQE */ + 32, 33, 34, -1, /* 3rd CQE */ + 16, 17, 18, -1, /* 2nd CQE */ + 0, 1, 2, -1 /* 1st CQE */ + }; + /* Mask to generate a vector having flow tags. 
*/ + const uint8x16_t ftag_shuf_m = { + 60, 61, 62, -1, /* 4th CQE */ + 44, 45, 46, -1, /* 3rd CQE */ + 28, 29, 30, -1, /* 2nd CQE */ + 12, 13, 14, -1 /* 1st CQE */ + }; + const uint16x8_t crc_adj = { + 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0 + }; + const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) }; + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch_non_temporal(cq); + rte_prefetch_non_temporal(cq + 1); + rte_prefetch_non_temporal(cq + 2); + rte_prefetch_non_temporal(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + pkts += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remained packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + /* + * Note that vectors have reverse order - {v3, v2, v1, v0}, because + * there's no instruction to count trailing zeros. __builtin_clzl() is + * used instead. + * + * A. copy 4 mbuf pointers from elts ring to returing pkts. + * B. load 64B CQE and extract necessary fields + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint16_t hdr_type_etc; + * uint8_t pkt_info; + * uint8_t rsvd; + * uint16_t byte_cnt; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * uint8_t flow_tag[3]; + * uint8_t op_own; + * } c; + * C. fill in mbuf. + * D. get valid CQEs. + * E. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + uint16x4_t op_own; + uint16x4_t opcode, owner_mask, invalid_mask; + uint16x4_t comp_mask; + uint16x4_t mask; + uint16x4_t byte_cnt; + uint32x4_t ptype_info, flow_tag; + register uint64x2_t c0, c1, c2, c3; + uint8_t *p0, *p1, *p2, *p3; + uint8_t *e0 = (void *)&elts[pos]->pkt_len; + uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len; + uint8_t *e2 = (void *)&elts[pos + 2]->pkt_len; + uint8_t *e3 = (void *)&elts[pos + 3]->pkt_len; + void *elts_p = (void *)&elts[pos]; + void *pkts_p = (void *)&pkts[pos]; + + /* A.0 do not cross the end of CQ. */ + mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> ((pkts_n - pos) * + sizeof(uint16_t) * 8) : 0); + p0 = (void *)&cq[pos].pkt_info; + p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe); + p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe); + p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe); + /* B.0 (CQE 3) load a block having op_own. */ + c3 = vld1q_u64((uint64_t *)(p3 + 48)); + /* B.0 (CQE 2) load a block having op_own. */ + c2 = vld1q_u64((uint64_t *)(p2 + 48)); + /* B.0 (CQE 1) load a block having op_own. */ + c1 = vld1q_u64((uint64_t *)(p1 + 48)); + /* B.0 (CQE 0) load a block having op_own. */ + c0 = vld1q_u64((uint64_t *)(p0 + 48)); + /* Synchronize for loading the rest of blocks. */ + rte_cio_rmb(); + /* Prefetch next 4 CQEs. 
*/ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP; + rte_prefetch_non_temporal(&cq[next]); + rte_prefetch_non_temporal(&cq[next + 1]); + rte_prefetch_non_temporal(&cq[next + 2]); + rte_prefetch_non_temporal(&cq[next + 3]); + } + __asm__ volatile ( + /* B.1 (CQE 3) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p3]] \n\t" + /* B.2 (CQE 3) move the block having op_own. */ + "mov v19.16b, %[c3].16b \n\t" + /* B.3 (CQE 3) extract 16B fields. */ + "tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 2) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t" + /* B.4 (CQE 3) adjust CRC length. */ + "sub v23.8h, v23.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 3) generate final structure for mbuf. */ + "tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 2) move the block having op_own. */ + "mov v19.16b, %[c2].16b \n\t" + /* B.3 (CQE 2) extract 16B fields. */ + "tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 1) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t" + /* B.4 (CQE 2) adjust CRC length. */ + "sub v22.8h, v22.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 2) generate final structure for mbuf. */ + "tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 1) move the block having op_own. */ + "mov v19.16b, %[c1].16b \n\t" + /* B.3 (CQE 1) extract 16B fields. */ + "tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 0) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t" + /* B.4 (CQE 1) adjust CRC length. */ + "sub v21.8h, v21.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 1) generate final structure for mbuf. */ + "tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 0) move the block having op_own. */ + "mov v19.16b, %[c0].16b \n\t" + /* A.1 load mbuf pointers. */ + "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t" + /* B.3 (CQE 0) extract 16B fields. */ + "tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.4 (CQE 0) adjust CRC length. */ + "sub v20.8h, v20.8h, %[crc_adj].8h \n\t" + /* D.1 extract op_own byte. */ + "tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t" + /* C.2 (CQE 3) adjust flow mark. */ + "add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v15.2d}, [%[e3]] \n\t" + /* C.2 (CQE 2) adjust flow mark. */ + "add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v14.2d}, [%[e2]] \n\t" + /* C.1 (CQE 0) generate final structure for mbuf. */ + "tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t" + /* C.2 (CQE 1) adjust flow mark. */ + "add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v13.2d}, [%[e1]] \n\t" +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Extract byte_cnt. */ + "tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t" +#endif + /* Extract ptype_info. */ + "tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t" + /* Extract flow_tag. */ + "tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t" + /* A.2 copy mbuf pointers. */ + "st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t" + /* C.2 (CQE 0) adjust flow mark. */ + "add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. 
*/ + "st1 {v12.2d}, [%[e0]] \n\t" + :[op_own]"=&w"(op_own), + [byte_cnt]"=&w"(byte_cnt), + [ptype_info]"=&w"(ptype_info), + [flow_tag]"=&w"(flow_tag) + :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), + [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0), + [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0), + [elts_p]"r"(elts_p), + [pkts_p]"r"(pkts_p), + [cqe_shuf_m]"w"(cqe_shuf_m), + [mb_shuf_m]"w"(mb_shuf_m), + [owner_shuf_m]"w"(owner_shuf_m), + [len_shuf_m]"w"(len_shuf_m), + [ptype_shuf_m]"w"(ptype_shuf_m), + [ftag_shuf_m]"w"(ftag_shuf_m), + [crc_adj]"w"(crc_adj), + [flow_mark_adj]"w"(flow_mark_adj) + :"memory", + "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", + "v24", "v25"); + /* D.2 flip owner bit to mark CQEs from last round. */ + owner_mask = vand_u16(op_own, owner_check); + owner_mask = vceq_u16(owner_mask, ownership); + /* D.3 get mask for invalidated CQEs. */ + opcode = vand_u16(op_own, opcode_check); + invalid_mask = vceq_u16(opcode_check, opcode); + /* E.1 find compressed CQE format. */ + comp_mask = vand_u16(op_own, format_check); + comp_mask = vceq_u16(comp_mask, format_check); + /* D.4 mask out beyond boundary. */ + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.5 merge invalid_mask with invalid owner. */ + invalid_mask = vorr_u16(invalid_mask, owner_mask); + /* E.2 mask out invalid entries. */ + comp_mask = vbic_u16(comp_mask, invalid_mask); + /* E.3 get the first compressed CQE. */ + comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16( + comp_mask), 0)) / + (sizeof(uint16_t) * 8); + /* D.6 mask out entries after the compressed CQE. */ + mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> (comp_idx * sizeof(uint16_t) * 8) : + 0); + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.7 count non-compressed valid CQEs. */ + n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16( + invalid_mask), 0)) / (sizeof(uint16_t) * 8); + nocmp_n += n; + /* D.2 get the final invalid mask. */ + mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> (n * sizeof(uint16_t) * 8) : 0); + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.3 check error in opcode. */ + opcode = vceq_u16(resp_err_check, opcode); + opcode = vbic_u16(opcode, invalid_mask); + /* D.4 mark if any error is set */ + *err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0); + /* C.4 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag, + opcode, &elts[pos]); + if (rxq->hw_timestamp) { + elts[pos]->timestamp = + rte_be_to_cpu_64( + container_of(p0, struct mlx5_cqe, + pkt_info)->timestamp); + elts[pos + 1]->timestamp = + rte_be_to_cpu_64( + container_of(p1, struct mlx5_cqe, + pkt_info)->timestamp); + elts[pos + 2]->timestamp = + rte_be_to_cpu_64( + container_of(p2, struct mlx5_cqe, + pkt_info)->timestamp); + elts[pos + 3]->timestamp = + rte_be_to_cpu_64( + container_of(p3, struct mlx5_cqe, + pkt_info)->timestamp); + } + if (!!rxq->flow_meta_mask) { + /* This code is subject for futher optimization. 
*/
+			int32_t offs = rxq->flow_meta_offset;
+
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+				container_of(p0, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+				container_of(p1, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+				container_of(p2, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+				container_of(p3, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+				elts[pos]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+				elts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+				elts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+				elts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+		}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Add up received bytes count. */
+		byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+		/*
+		 * Break the loop unless more valid CQE is expected, or if
+		 * there's a compressed CQE.
+		 */
+		if (n != MLX5_VPMD_DESCS_PER_LOOP)
+			break;
+	}
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+		return rcvd_pkt;
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	rxq->cq_ci += nocmp_n;
+	rxq->rq_pi += nocmp_n;
+	rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += nocmp_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+							&elts[nocmp_n]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
+			rxq->rq_pi += n;
+			rcvd_pkt += n;
+			rxq->decompressed -= n;
+		}
+	}
+	rte_cio_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	return rcvd_pkt;
+}
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
new file mode 100644
index 000000000..6847ae782
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -0,0 +1,731 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <smmintrin.h>
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
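+ *
+ * Note: the mbuf pointers are copied two at a time with 128-bit SSE
+ * loads/stores (_mm_loadu_si128/_mm_storeu_si128); a trailing odd pointer
+ * is copied as a scalar.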
+ */ +static inline void +rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + __m128i mbp; + + mbp = _mm_loadu_si128((__m128i *)&elts[pos]); + _mm_storeu_si128((__m128i *)&pkts[pos], mbp); + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. + * + * @return + * Number of mini-CQEs successfully decompressed. + */ +static inline uint16_t +rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts) +{ + volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1); + struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */ + unsigned int pos; + unsigned int i; + unsigned int inv = 0; + /* Mask to shuffle from extracted mini CQE to mbuf. */ + const __m128i shuf_mask1 = + _mm_set_epi8(0, 1, 2, 3, /* rss, bswap32 */ + -1, -1, /* skip vlan_tci */ + 6, 7, /* data_len, bswap16 */ + -1, -1, 6, 7, /* pkt_len, bswap16 */ + -1, -1, -1, -1 /* skip packet_type */); + const __m128i shuf_mask2 = + _mm_set_epi8(8, 9, 10, 11, /* rss, bswap32 */ + -1, -1, /* skip vlan_tci */ + 14, 15, /* data_len, bswap16 */ + -1, -1, 14, 15, /* pkt_len, bswap16 */ + -1, -1, -1, -1 /* skip packet_type */); + /* Restore the compressed count. Must be 16 bits. */ + const uint16_t mcqe_n = t_pkt->data_len + + (rxq->crc_present * RTE_ETHER_CRC_LEN); + const __m128i rearm = + _mm_loadu_si128((__m128i *)&t_pkt->rearm_data); + const __m128i rxdf = + _mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, + 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, + 0, 0); + const uint32_t flow_tag = t_pkt->hash.fdir.hi; +#ifdef MLX5_PMD_SOFT_COUNTERS + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 14, 15, 6, 7, + 10, 11, 2, 3); +#endif + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. + * C. combine data from mCQEs with rx_descriptor_fields1. + * D. store rx_descriptor_fields1. + * E. store flow tag (rte_flow mark). + */ + for (pos = 0; pos < mcqe_n; ) { + __m128i mcqe1, mcqe2; + __m128i rxdf1, rxdf2; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt, invalid_mask; +#endif + + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + if (likely(pos + i < mcqe_n)) + rte_prefetch0((void *)(cq + pos + i)); + + /* A.1 load mCQEs into a 128bit register. */ + mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]); + mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]); + /* B.1 store rearm data to mbuf. */ + _mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm); + _mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm); + /* C.1 combine data from mCQEs with rx_descriptor_fields1. 
*/ + rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1); + rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2); + rxdf1 = _mm_sub_epi16(rxdf1, crc_adj); + rxdf2 = _mm_sub_epi16(rxdf2, crc_adj); + rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23); + rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23); + /* D.1 store rx_descriptor_fields1. */ + _mm_storeu_si128((__m128i *) + &elts[pos]->rx_descriptor_fields1, + rxdf1); + _mm_storeu_si128((__m128i *) + &elts[pos + 1]->rx_descriptor_fields1, + rxdf2); + /* B.1 store rearm data to mbuf. */ + _mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm); + _mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm); + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1); + rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2); + rxdf1 = _mm_sub_epi16(rxdf1, crc_adj); + rxdf2 = _mm_sub_epi16(rxdf2, crc_adj); + rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23); + rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23); + /* D.1 store rx_descriptor_fields1. */ + _mm_storeu_si128((__m128i *) + &elts[pos + 2]->rx_descriptor_fields1, + rxdf1); + _mm_storeu_si128((__m128i *) + &elts[pos + 3]->rx_descriptor_fields1, + rxdf2); +#ifdef MLX5_PMD_SOFT_COUNTERS + invalid_mask = _mm_set_epi64x(0, + (mcqe_n - pos) * + sizeof(uint16_t) * 8); + invalid_mask = _mm_sll_epi64(ones, invalid_mask); + mcqe1 = _mm_srli_si128(mcqe1, 4); + byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc); + byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask); + byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt); + byte_cnt = _mm_hadd_epi16(byte_cnt, zero); + rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero)); +#endif + if (rxq->mark) { + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } + if (rxq->dynf_meta) { + int32_t offs = rxq->flow_meta_offset; + const uint32_t meta = + *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); + + /* Check if title packet has valid metadata. */ + if (meta) { + MLX5_ASSERT(t_pkt->ol_flags & + rxq->flow_meta_mask); + *RTE_MBUF_DYNFIELD(elts[pos], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, + uint32_t *) = meta; + } + } + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { + mcq = (void *)(cq + pos); + for (i = 0; i < 8; ++i) + cq[inv++].op_own = MLX5_CQE_INVALIDATE; + } + } + /* Invalidate the rest of CQEs. */ + for (; inv < mcqe_n; ++inv) + cq[inv].op_own = MLX5_CQE_INVALIDATE; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += mcqe_n; + rxq->stats.ibytes += rcvd_byte; +#endif + rxq->cq_ci += mcqe_n; + return mcqe_n; +} + +/** + * Calculate packet type and offload flag for mbuf and store it. + * + * @param rxq + * Pointer to RX queue structure. + * @param cqes[4] + * Array of four 16bytes completions extracted from the original completion + * descriptor. + * @param op_err + * Opcode vector having responder error status. Each field is 4B. + * @param pkts + * Pointer to array of packets to be filled. 
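+ *
+ * The packet type is looked up in mlx5_ptype_table from the CQE type bits,
+ * and ol_flags (RSS hash, FDIR mark, checksum and VLAN indications) are
+ * built from the CQE fields, merged with rxq->mbuf_initializer and written
+ * to each mbuf's rearm_data.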
+ */ +static inline void +rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], + __m128i op_err, struct rte_mbuf **pkts) +{ + __m128i pinfo0, pinfo1; + __m128i pinfo, ptype; + __m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP); + __m128i cv_flags; + const __m128i zero = _mm_setzero_si128(); + const __m128i ptype_mask = + _mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06); + const __m128i ptype_ol_mask = + _mm_set_epi32(0x106, 0x106, 0x106, 0x106); + const __m128i pinfo_mask = + _mm_set_epi32(0x3, 0x3, 0x3, 0x3); + const __m128i cv_flag_sel = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | + PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), + 0, + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), + (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED), + 0); + const __m128i cv_mask = + _mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED); + const __m128i mbuf_init = + _mm_load_si128((__m128i *)&rxq->mbuf_initializer); + __m128i rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; + + /* Extract pkt_info field. */ + pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1); + /* Extract hdr_type_etc field. */ + pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]); + ptype = _mm_unpacklo_epi64(pinfo0, pinfo1); + if (rxq->mark) { + const __m128i pinfo_ft_mask = + _mm_set_epi32(0xffffff00, 0xffffff00, + 0xffffff00, 0xffffff00); + const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR); + __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID); + __m128i flow_tag, invalid_mask; + + flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask); + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = _mm_cmpeq_epi32(flow_tag, zero); + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128(invalid_mask, + fdir_flags)); + /* Mask out invalid entries. */ + fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags); + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128( + _mm_cmpeq_epi32(flow_tag, + pinfo_ft_mask), + fdir_id_flags)); + } + /* + * Merge the two fields to generate the following: + * bit[1] = l3_ok + * bit[2] = l4_ok + * bit[8] = cv + * bit[11:10] = l3_hdr_type + * bit[14:12] = l4_hdr_type + * bit[15] = ip_frag + * bit[16] = tunneled + * bit[17] = outer_l3_type + */ + ptype = _mm_and_si128(ptype, ptype_mask); + pinfo = _mm_and_si128(pinfo, pinfo_mask); + pinfo = _mm_slli_epi32(pinfo, 16); + /* Make pinfo has merged fields for ol_flags calculation. */ + pinfo = _mm_or_si128(ptype, pinfo); + ptype = _mm_srli_epi32(pinfo, 10); + ptype = _mm_packs_epi32(ptype, zero); + /* Errored packets will have RTE_PTYPE_ALL_MASK. 
*/ + op_err = _mm_srli_epi16(op_err, 8); + ptype = _mm_or_si128(ptype, op_err); + pt_idx0 = _mm_extract_epi8(ptype, 0); + pt_idx1 = _mm_extract_epi8(ptype, 2); + pt_idx2 = _mm_extract_epi8(ptype, 4); + pt_idx3 = _mm_extract_epi8(ptype, 6); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; + /* Fill flags for checksum and VLAN. */ + pinfo = _mm_and_si128(pinfo, ptype_ol_mask); + pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo); + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */ + cv_flags = _mm_slli_epi32(pinfo, 9); + cv_flags = _mm_or_si128(pinfo, cv_flags); + /* Move back flags to start from byte[0]. */ + cv_flags = _mm_srli_epi32(cv_flags, 8); + /* Mask out garbage bits. */ + cv_flags = _mm_and_si128(cv_flags, cv_mask); + /* Merge to ol_flags. */ + ol_flags = _mm_or_si128(ol_flags, cv_flags); + /* Merge mbuf_init and ol_flags. */ + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30); + rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30); + /* Write 8B rearm_data and 8B ol_flags. */ + _mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0); + _mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1); + _mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2); + _mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3); +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint64_t *err) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const __m128i owner_check = + _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL); + const __m128i opcode_check = + _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL); + const __m128i format_check = + _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL); + const __m128i resp_err_check = + _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 12, 13, 8, 9, + 4, 5, 0, 1); +#endif + /* Mask to shuffle from extracted CQE to mbuf. 
*/ + const __m128i shuf_mask = + _mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */ + 12, 13, 14, 15, /* rss, bswap32 */ + 10, 11, /* vlan_tci, bswap16 */ + 4, 5, /* data_len, bswap16 */ + -1, -1, /* zero out 2nd half of pkt_len */ + 4, 5 /* pkt_len, bswap16 */); + /* Mask to blend from the last Qword to the first DQword. */ + const __m128i blend_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, -1); + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, 0, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, + 0, + rxq->crc_present * RTE_ETHER_CRC_LEN); + const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0); + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + pkts += rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remained packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remained CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP]; + __m128i cqe_tmp1, cqe_tmp2; + __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + __m128i op_own, op_own_tmp1, op_own_tmp2; + __m128i opcode, owner_mask, invalid_mask; + __m128i comp_mask; + __m128i mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt; +#endif + __m128i mbp1, mbp2; + __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + /* A.0 do not cross the end of CQ. */ + mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + p = _mm_andnot_si128(mask, p); + /* A.1 load cqes. 
*/ + p3 = _mm_extract_epi16(p, 3); + cqes[3] = _mm_loadl_epi64((__m128i *) + &cq[pos + p3].sop_drop_qpn); + rte_compiler_barrier(); + p2 = _mm_extract_epi16(p, 2); + cqes[2] = _mm_loadl_epi64((__m128i *) + &cq[pos + p2].sop_drop_qpn); + rte_compiler_barrier(); + /* B.1 load mbuf pointers. */ + mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]); + mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]); + /* A.1 load a block having op_own. */ + p1 = _mm_extract_epi16(p, 1); + cqes[1] = _mm_loadl_epi64((__m128i *) + &cq[pos + p1].sop_drop_qpn); + rte_compiler_barrier(); + cqes[0] = _mm_loadl_epi64((__m128i *) + &cq[pos].sop_drop_qpn); + /* B.2 copy mbuf pointers. */ + _mm_storeu_si128((__m128i *)&pkts[pos], mbp1); + _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2); + rte_cio_rmb(); + /* C.1 load remained CQE data and extract necessary fields. */ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]); + cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask); + pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj); + pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj); + pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3); + _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2); + /* E.1 extract op_own field. */ + op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + /* C.1 load remained CQE data and extract necessary fields. */ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]); + cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask); + pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj); + pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj); + pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj); + /* E.1 extract op_own byte. 
*/ + op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1); + _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0); + /* E.2 flip owner bit to mark CQEs from last round. */ + owner_mask = _mm_and_si128(op_own, owner_check); + if (ownership) + owner_mask = _mm_xor_si128(owner_mask, owner_check); + owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check); + owner_mask = _mm_packs_epi32(owner_mask, zero); + /* E.3 get mask for invalidated CQEs. */ + opcode = _mm_and_si128(op_own, opcode_check); + invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode); + invalid_mask = _mm_packs_epi32(invalid_mask, zero); + /* E.4 mask out beyond boundary. */ + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.5 merge invalid_mask with invalid owner. */ + invalid_mask = _mm_or_si128(invalid_mask, owner_mask); + /* F.1 find compressed CQE format. */ + comp_mask = _mm_and_si128(op_own, format_check); + comp_mask = _mm_cmpeq_epi32(comp_mask, format_check); + comp_mask = _mm_packs_epi32(comp_mask, zero); + /* F.2 mask out invalid entries. */ + comp_mask = _mm_andnot_si128(invalid_mask, comp_mask); + comp_idx = _mm_cvtsi128_si64(comp_mask); + /* F.3 get the first compressed CQE. */ + comp_idx = comp_idx ? + __builtin_ctzll(comp_idx) / + (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + /* E.6 mask out entries after the compressed CQE. */ + mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.7 count non-compressed valid CQEs. */ + n = _mm_cvtsi128_si64(invalid_mask); + n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + nocmp_n += n; + /* D.2 get the final invalid mask. */ + mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* D.3 check error in opcode. */ + opcode = _mm_cmpeq_epi32(resp_err_check, opcode); + opcode = _mm_packs_epi32(opcode, zero); + opcode = _mm_andnot_si128(invalid_mask, opcode); + /* D.4 mark if any error is set */ + *err |= _mm_cvtsi128_si64(opcode); + /* D.5 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]); + if (rxq->hw_timestamp) { + pkts[pos]->timestamp = + rte_be_to_cpu_64(cq[pos].timestamp); + pkts[pos + 1]->timestamp = + rte_be_to_cpu_64(cq[pos + p1].timestamp); + pkts[pos + 2]->timestamp = + rte_be_to_cpu_64(cq[pos + p2].timestamp); + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } + if (rxq->dynf_meta) { + /* This code is subject for futher optimization. 
*/ + int32_t offs = rxq->flow_meta_offset; + + *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + cq[pos].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + cq[pos + p1].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + cq[pos + p2].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + cq[pos + p3].flow_table_metadata; + if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *)) + pkts[pos]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *)) + pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *)) + pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *)) + pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ + byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask); + byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt); + byte_cnt = _mm_hadd_epi16(byte_cnt, zero); + rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero)); +#endif + /* + * Break the loop unless more valid CQE is expected, or if + * there's a compressed CQE. + */ + if (n != MLX5_VPMD_DESCS_PER_LOOP) + break; + } + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) + return rcvd_pkt; + /* Update the consumer indexes for non-compressed CQEs. */ + MLX5_ASSERT(nocmp_n <= pkts_n); + rxq->cq_ci += nocmp_n; + rxq->rq_pi += nocmp_n; + rcvd_pkt += nocmp_n; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += nocmp_n; + rxq->stats.ibytes += rcvd_byte; +#endif + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { + MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n], + &elts[nocmp_n]); + /* Return more packets if needed. */ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->decompressed; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); + rxq->rq_pi += n; + rcvd_pkt += n; + rxq->decompressed -= n; + } + } + rte_compiler_barrier(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + return rcvd_pkt; +} + +#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c new file mode 100644 index 000000000..a79896cb3 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "rte_eal.h" +#include "mlx5_utils.h" +#include "mlx5.h" + +/* PMD socket service for tools. */ + +int server_socket; /* Unix socket for primary process. */ +struct rte_intr_handle server_intr_handle; /* Interrupt handler. */ + +static void +mlx5_pmd_make_path(struct sockaddr_un *addr, int pid) +{ + snprintf(addr->sun_path, sizeof(addr->sun_path), "/var/tmp/dpdk_%s_%d", + MLX5_DRIVER_NAME, pid); +} + +/** + * Handle server pmd socket interrupts. 
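+ *
+ * Accepts a connection on the PMD socket, receives a file descriptor as
+ * SCM_RIGHTS ancillary data together with a port identifier, dumps the
+ * flows of that port to the received descriptor through
+ * mlx5_flow_dev_dump(), and sends the result code back to the client.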
+ */ +static void +mlx5_pmd_socket_handle(void *cb __rte_unused) +{ + int conn_sock; + int ret = -1; + struct cmsghdr *cmsg = NULL; + int data; + char buf[CMSG_SPACE(sizeof(int))] = { 0 }; + struct iovec io = { + .iov_base = &data, + .iov_len = sizeof(data), + }; + struct msghdr msg = { + .msg_iov = &io, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = sizeof(buf), + }; + uint16_t port_id; + int fd; + FILE *file = NULL; + struct rte_eth_dev *dev; + + /* Accept the connection from the client. */ + conn_sock = accept(server_socket, NULL, NULL); + if (conn_sock < 0) { + DRV_LOG(WARNING, "connection failed: %s", strerror(errno)); + return; + } + ret = recvmsg(conn_sock, &msg, MSG_WAITALL); + if (ret < 0) { + DRV_LOG(WARNING, "wrong message received: %s", + strerror(errno)); + goto error; + } + /* Receive file descriptor. */ + cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS || + cmsg->cmsg_len < sizeof(int)) { + DRV_LOG(WARNING, "invalid file descriptor message"); + goto error; + } + memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd)); + file = fdopen(fd, "w"); + if (!file) { + DRV_LOG(WARNING, "Failed to open file"); + goto error; + } + /* Receive port number. */ + if (msg.msg_iovlen != 1 || msg.msg_iov->iov_len < sizeof(uint16_t)) { + DRV_LOG(WARNING, "wrong port number message"); + goto error; + } + memcpy(&port_id, msg.msg_iov->iov_base, sizeof(port_id)); + if (!rte_eth_dev_is_valid_port(port_id)) { + DRV_LOG(WARNING, "Invalid port %u", port_id); + goto error; + } + /* Dump flow. */ + dev = &rte_eth_devices[port_id]; + ret = mlx5_flow_dev_dump(dev, file, NULL); + /* Set-up the ancillary data and reply. */ + msg.msg_controllen = 0; + msg.msg_control = NULL; + msg.msg_iovlen = 1; + msg.msg_iov = &io; + data = -ret; + io.iov_len = sizeof(data); + io.iov_base = &data; + do { + ret = sendmsg(conn_sock, &msg, 0); + } while (ret < 0 && errno == EINTR); + if (ret < 0) + DRV_LOG(WARNING, "failed to send response %s", + strerror(errno)); +error: + if (conn_sock > 0) + close(conn_sock); + if (file) + fclose(file); +} + +/** + * Install interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + * @return + * 0 on success, a negative errno value otherwise. + */ +static int +mlx5_pmd_interrupt_handler_install(void) +{ + MLX5_ASSERT(server_socket); + server_intr_handle.fd = server_socket; + server_intr_handle.type = RTE_INTR_HANDLE_EXT; + return rte_intr_callback_register(&server_intr_handle, + mlx5_pmd_socket_handle, NULL); +} + +/** + * Uninstall interrupt handler. + */ +static void +mlx5_pmd_interrupt_handler_uninstall(void) +{ + if (server_socket) { + mlx5_intr_callback_unregister(&server_intr_handle, + mlx5_pmd_socket_handle, + NULL); + } + server_intr_handle.fd = 0; + server_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; +} + +/** + * Initialise the socket to communicate with the secondary process + * + * @param[in] dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, a negative value otherwise. + */ +int +mlx5_pmd_socket_init(void) +{ + struct sockaddr_un sun = { + .sun_family = AF_UNIX, + }; + int ret = -1; + int flags; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (server_socket) + return 0; + /* + * Initialize the socket to communicate with the secondary + * process. 
+ */ + ret = socket(AF_UNIX, SOCK_STREAM, 0); + if (ret < 0) { + DRV_LOG(WARNING, "Failed to open mlx5 socket: %s", + strerror(errno)); + goto error; + } + server_socket = ret; + flags = fcntl(server_socket, F_GETFL, 0); + if (flags == -1) + goto error; + ret = fcntl(server_socket, F_SETFL, flags | O_NONBLOCK); + if (ret < 0) + goto error; + mlx5_pmd_make_path(&sun, getpid()); + remove(sun.sun_path); + ret = bind(server_socket, (const struct sockaddr *)&sun, sizeof(sun)); + if (ret < 0) { + DRV_LOG(WARNING, + "cannot bind mlx5 socket: %s", strerror(errno)); + goto close; + } + ret = listen(server_socket, 0); + if (ret < 0) { + DRV_LOG(WARNING, "cannot listen on mlx5 socket: %s", + strerror(errno)); + goto close; + } + if (mlx5_pmd_interrupt_handler_install()) { + DRV_LOG(WARNING, "cannot register interrupt handler for mlx5 socket: %s", + strerror(errno)); + goto close; + } + return 0; +close: + remove(sun.sun_path); +error: + claim_zero(close(server_socket)); + server_socket = 0; + DRV_LOG(ERR, "Cannot initialize socket: %s", strerror(errno)); + return -errno; +} + +/** + * Un-Initialize the pmd socket + */ +RTE_FINI(mlx5_pmd_socket_uninit) +{ + if (!server_socket) + return; + mlx5_pmd_interrupt_handler_uninstall(); + claim_zero(close(server_socket)); + server_socket = 0; + MKSTR(path, "/var/tmp/dpdk_%s_%d", MLX5_DRIVER_NAME, getpid()); + claim_zero(remove(path)); +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c new file mode 100644 index 000000000..b4ca6922a --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c @@ -0,0 +1,589 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <fcntl.h> +#include <inttypes.h> +#include <linux/sockios.h> +#include <linux/ethtool.h> +#include <stdint.h> +#include <stdio.h> +#include <unistd.h> + +#include <rte_ethdev_driver.h> +#include <rte_common.h> +#include <rte_malloc.h> + +#include <mlx5_common.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" + + +static const struct mlx5_counter_ctrl mlx5_counters_init[] = { + { + .dpdk_name = "rx_port_unicast_bytes", + .ctr_name = "rx_vport_unicast_bytes", + }, + { + .dpdk_name = "rx_port_multicast_bytes", + .ctr_name = "rx_vport_multicast_bytes", + }, + { + .dpdk_name = "rx_port_broadcast_bytes", + .ctr_name = "rx_vport_broadcast_bytes", + }, + { + .dpdk_name = "rx_port_unicast_packets", + .ctr_name = "rx_vport_unicast_packets", + }, + { + .dpdk_name = "rx_port_multicast_packets", + .ctr_name = "rx_vport_multicast_packets", + }, + { + .dpdk_name = "rx_port_broadcast_packets", + .ctr_name = "rx_vport_broadcast_packets", + }, + { + .dpdk_name = "tx_port_unicast_bytes", + .ctr_name = "tx_vport_unicast_bytes", + }, + { + .dpdk_name = "tx_port_multicast_bytes", + .ctr_name = "tx_vport_multicast_bytes", + }, + { + .dpdk_name = "tx_port_broadcast_bytes", + .ctr_name = "tx_vport_broadcast_bytes", + }, + { + .dpdk_name = "tx_port_unicast_packets", + .ctr_name = "tx_vport_unicast_packets", + }, + { + .dpdk_name = "tx_port_multicast_packets", + .ctr_name = "tx_vport_multicast_packets", + }, + { + .dpdk_name = "tx_port_broadcast_packets", + .ctr_name = "tx_vport_broadcast_packets", + }, + { + .dpdk_name = "rx_wqe_err", + .ctr_name = "rx_wqe_err", + }, + { + .dpdk_name = "rx_crc_errors_phy", + .ctr_name = "rx_crc_errors_phy", + }, + { + .dpdk_name = "rx_in_range_len_errors_phy", + .ctr_name = "rx_in_range_len_errors_phy", + }, + { + .dpdk_name = 
"rx_symbol_err_phy", + .ctr_name = "rx_symbol_err_phy", + }, + { + .dpdk_name = "tx_errors_phy", + .ctr_name = "tx_errors_phy", + }, + { + .dpdk_name = "rx_out_of_buffer", + .ctr_name = "out_of_buffer", + .ib = 1, + }, + { + .dpdk_name = "tx_packets_phy", + .ctr_name = "tx_packets_phy", + }, + { + .dpdk_name = "rx_packets_phy", + .ctr_name = "rx_packets_phy", + }, + { + .dpdk_name = "tx_discards_phy", + .ctr_name = "tx_discards_phy", + }, + { + .dpdk_name = "rx_discards_phy", + .ctr_name = "rx_discards_phy", + }, + { + .dpdk_name = "tx_bytes_phy", + .ctr_name = "tx_bytes_phy", + }, + { + .dpdk_name = "rx_bytes_phy", + .ctr_name = "rx_bytes_phy", + }, + /* Representor only */ + { + .dpdk_name = "rx_packets", + .ctr_name = "vport_rx_packets", + }, + { + .dpdk_name = "rx_bytes", + .ctr_name = "vport_rx_bytes", + }, + { + .dpdk_name = "tx_packets", + .ctr_name = "vport_tx_packets", + }, + { + .dpdk_name = "tx_bytes", + .ctr_name = "vport_tx_bytes", + }, +}; + +static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init); + +static inline int +mlx5_read_ib_stat(struct mlx5_priv *priv, const char *ctr_name, uint64_t *stat) +{ + int fd; + + if (priv->sh) { + MKSTR(path, "%s/ports/%d/hw_counters/%s", + priv->sh->ibdev_path, + priv->ibv_port, + ctr_name); + fd = open(path, O_RDONLY); + if (fd != -1) { + char buf[21] = {'\0'}; + ssize_t n = read(fd, buf, sizeof(buf)); + + close(fd); + if (n != -1) { + *stat = strtoull(buf, NULL, 10); + return 0; + } + } + } + *stat = 0; + return 1; +} + +/** + * Read device counters table. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] stats + * Counters table output buffer. + * + * @return + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + unsigned int i; + struct ifreq ifr; + unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t); + unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz]; + struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf; + int ret; + + et_stats->cmd = ETHTOOL_GSTATS; + et_stats->n_stats = xstats_ctrl->stats_n; + ifr.ifr_data = (caddr_t)et_stats; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u unable to read statistic values from device", + dev->data->port_id); + return ret; + } + for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) { + if (xstats_ctrl->info[i].ib) { + ret = mlx5_read_ib_stat(priv, + xstats_ctrl->info[i].ctr_name, + &stats[i]); + /* return last xstats counter if fail to read. */ + if (ret == 0) + xstats_ctrl->xstats[i] = stats[i]; + else + stats[i] = xstats_ctrl->xstats[i]; + } else { + stats[i] = (uint64_t) + et_stats->data[xstats_ctrl->dev_table_idx[i]]; + } + } + return 0; +} + +/** + * Query the number of statistics provided by ETHTOOL. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Number of statistics on success, negative errno value otherwise and + * rte_errno is set. 
+ */ +static int +mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) { + struct ethtool_drvinfo drvinfo; + struct ifreq ifr; + int ret; + + drvinfo.cmd = ETHTOOL_GDRVINFO; + ifr.ifr_data = (caddr_t)&drvinfo; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u unable to query number of statistics", + dev->data->port_id); + return ret; + } + return drvinfo.n_stats; +} + +/** + * Init the structures to read device counters. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_stats_init(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl; + unsigned int i; + unsigned int j; + struct ifreq ifr; + struct ethtool_gstrings *strings = NULL; + unsigned int dev_stats_n; + unsigned int str_sz; + int ret; + + /* So that it won't aggregate for each init. */ + xstats_ctrl->mlx5_stats_n = 0; + ret = mlx5_ethtool_get_stats_n(dev); + if (ret < 0) { + DRV_LOG(WARNING, "port %u no extended statistics available", + dev->data->port_id); + return; + } + dev_stats_n = ret; + /* Allocate memory to grab stat names and values. */ + str_sz = dev_stats_n * ETH_GSTRING_LEN; + strings = (struct ethtool_gstrings *) + rte_malloc("xstats_strings", + str_sz + sizeof(struct ethtool_gstrings), 0); + if (!strings) { + DRV_LOG(WARNING, "port %u unable to allocate memory for xstats", + dev->data->port_id); + return; + } + strings->cmd = ETHTOOL_GSTRINGS; + strings->string_set = ETH_SS_STATS; + strings->len = dev_stats_n; + ifr.ifr_data = (caddr_t)strings; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u unable to get statistic names", + dev->data->port_id); + goto free; + } + for (i = 0; i != dev_stats_n; ++i) { + const char *curr_string = (const char *) + &strings->data[i * ETH_GSTRING_LEN]; + + for (j = 0; j != xstats_n; ++j) { + if (!strcmp(mlx5_counters_init[j].ctr_name, + curr_string)) { + unsigned int idx = xstats_ctrl->mlx5_stats_n++; + + xstats_ctrl->dev_table_idx[idx] = i; + xstats_ctrl->info[idx] = mlx5_counters_init[j]; + break; + } + } + } + /* Add IB counters. */ + for (i = 0; i != xstats_n; ++i) { + if (mlx5_counters_init[i].ib) { + unsigned int idx = xstats_ctrl->mlx5_stats_n++; + + xstats_ctrl->info[idx] = mlx5_counters_init[i]; + xstats_ctrl->hw_stats[idx] = 0; + } + } + MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS); + xstats_ctrl->stats_n = dev_stats_n; + /* Copy to base at first time. */ + ret = mlx5_read_dev_counters(dev, xstats_ctrl->base); + if (ret) + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base); + stats_ctrl->imissed = 0; +free: + rte_free(strings); +} + +/** + * DPDK callback to get extended device statistics. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] stats + * Pointer to rte extended stats table. + * @param n + * The size of the stats table. + * + * @return + * Number of extended stats on success and stats is filled, + * negative on error and rte_errno is set. 
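+ *
+ * Applications normally reach this through the generic ethdev API using
+ * the usual two-step pattern, e.g. (illustrative sketch with standard
+ * DPDK calls, not specific to this PMD):
+ *
+ *   int n = rte_eth_xstats_get(port_id, NULL, 0);
+ *   struct rte_eth_xstat *xs = calloc(n, sizeof(*xs));
+ *   struct rte_eth_xstat_name *names = calloc(n, sizeof(*names));
+ *   rte_eth_xstats_get_names(port_id, names, n);
+ *   rte_eth_xstats_get(port_id, xs, n);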
+ */ +int +mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + uint64_t counters[n]; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + uint16_t mlx5_stats_n = xstats_ctrl->mlx5_stats_n; + + if (n >= mlx5_stats_n && stats) { + int stats_n; + int ret; + + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) + return stats_n; + if (xstats_ctrl->stats_n != stats_n) + mlx5_stats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) + return ret; + for (i = 0; i != mlx5_stats_n; ++i) { + stats[i].id = i; + if (xstats_ctrl->info[i].ib) { + uint64_t wrap_n; + uint64_t hw_stat = xstats_ctrl->hw_stats[i]; + + stats[i].value = (counters[i] - + xstats_ctrl->base[i]) & + (uint64_t)UINT32_MAX; + wrap_n = hw_stat >> 32; + if (stats[i].value < + (hw_stat & (uint64_t)UINT32_MAX)) + wrap_n++; + stats[i].value |= (wrap_n) << 32; + xstats_ctrl->hw_stats[i] = stats[i].value; + } else { + stats[i].value = + (counters[i] - xstats_ctrl->base[i]); + } + } + } + return mlx5_stats_n; +} + +/** + * DPDK callback to get device statistics. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] stats + * Stats structure output buffer. + * + * @return + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. + */ +int +mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl; + struct rte_eth_stats tmp; + unsigned int i; + unsigned int idx; + uint64_t wrap_n; + int ret; + + memset(&tmp, 0, sizeof(tmp)); + /* Add software counters. */ + for (i = 0; (i != priv->rxqs_n); ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + idx = rxq->idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.q_ipackets[idx] += rxq->stats.ipackets; + tmp.q_ibytes[idx] += rxq->stats.ibytes; +#endif + tmp.q_errors[idx] += (rxq->stats.idropped + + rxq->stats.rx_nombuf); + } +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.ipackets += rxq->stats.ipackets; + tmp.ibytes += rxq->stats.ibytes; +#endif + tmp.ierrors += rxq->stats.idropped; + tmp.rx_nombuf += rxq->stats.rx_nombuf; + } + for (i = 0; (i != priv->txqs_n); ++i) { + struct mlx5_txq_data *txq = (*priv->txqs)[i]; + + if (txq == NULL) + continue; + idx = txq->idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.q_opackets[idx] += txq->stats.opackets; + tmp.q_obytes[idx] += txq->stats.obytes; +#endif + } +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.opackets += txq->stats.opackets; + tmp.obytes += txq->stats.obytes; +#endif + tmp.oerrors += txq->stats.oerrors; + } + ret = mlx5_read_ib_stat(priv, "out_of_buffer", &tmp.imissed); + if (ret == 0) { + tmp.imissed = (tmp.imissed - stats_ctrl->imissed_base) & + (uint64_t)UINT32_MAX; + wrap_n = stats_ctrl->imissed >> 32; + if (tmp.imissed < (stats_ctrl->imissed & (uint64_t)UINT32_MAX)) + wrap_n++; + tmp.imissed |= (wrap_n) << 32; + stats_ctrl->imissed = tmp.imissed; + } else { + tmp.imissed = stats_ctrl->imissed; + } +#ifndef MLX5_PMD_SOFT_COUNTERS + /* FIXME: retrieve and add hardware counters. */ +#endif + *stats = tmp; + return 0; +} + +/** + * DPDK callback to clear device statistics. + * + * @param dev + * Pointer to Ethernet device structure. 
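+ *
+ * Besides zeroing the per-queue software counters, the 32-bit wrap-around
+ * base kept for the "out_of_buffer" (imissed) IB counter is re-read so
+ * that subsequent reads start from zero.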
+ * + * @return + * always 0 on success and stats is reset + */ +int +mlx5_stats_reset(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl; + unsigned int i; + + for (i = 0; (i != priv->rxqs_n); ++i) { + if ((*priv->rxqs)[i] == NULL) + continue; + memset(&(*priv->rxqs)[i]->stats, 0, + sizeof(struct mlx5_rxq_stats)); + } + for (i = 0; (i != priv->txqs_n); ++i) { + if ((*priv->txqs)[i] == NULL) + continue; + memset(&(*priv->txqs)[i]->stats, 0, + sizeof(struct mlx5_txq_stats)); + } + mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base); + stats_ctrl->imissed = 0; +#ifndef MLX5_PMD_SOFT_COUNTERS + /* FIXME: reset hardware counters. */ +#endif + + return 0; +} + +/** + * DPDK callback to clear device extended statistics. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success and stats is reset, negative errno value otherwise and + * rte_errno is set. + */ +int +mlx5_xstats_reset(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + int stats_n; + unsigned int i; + unsigned int n = xstats_ctrl->mlx5_stats_n; + uint64_t counters[n]; + int ret; + + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) { + DRV_LOG(ERR, "port %u cannot get stats: %s", dev->data->port_id, + strerror(-stats_n)); + return stats_n; + } + if (xstats_ctrl->stats_n != stats_n) + mlx5_stats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) { + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + for (i = 0; i != n; ++i) { + xstats_ctrl->base[i] = counters[i]; + xstats_ctrl->hw_stats[i] = 0; + } + + return 0; +} + +/** + * DPDK callback to retrieve names of extended device statistics + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] xstats_names + * Buffer to insert names into. + * @param n + * Number of names. + * + * @return + * Number of xstats names. + */ +int +mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, unsigned int n) +{ + unsigned int i; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + unsigned int mlx5_xstats_n = xstats_ctrl->mlx5_stats_n; + + if (n >= mlx5_xstats_n && xstats_names) { + for (i = 0; i != mlx5_xstats_n; ++i) { + strncpy(xstats_names[i].name, + xstats_ctrl->info[i].dpdk_name, + RTE_ETH_XSTATS_NAME_SIZE); + xstats_names[i].name[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0; + } + } + return mlx5_xstats_n; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c new file mode 100644 index 000000000..8106598ff --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c @@ -0,0 +1,579 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <unistd.h> + +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_interrupts.h> +#include <rte_alarm.h> + +#include "mlx5.h" +#include "mlx5_mr.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" +#include "rte_pmd_mlx5.h" + +/** + * Stop traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. 
+ */ +static void +mlx5_txq_stop(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->txqs_n; ++i) + mlx5_txq_release(dev, i); +} + +/** + * Start traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_txq_start(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + int ret; + + for (i = 0; i != priv->txqs_n; ++i) { + struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i); + + if (!txq_ctrl) + continue; + if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) { + txq_ctrl->obj = mlx5_txq_obj_new + (dev, i, MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN); + } else { + txq_alloc_elts(txq_ctrl); + txq_ctrl->obj = mlx5_txq_obj_new + (dev, i, MLX5_TXQ_OBJ_TYPE_IBV); + } + if (!txq_ctrl->obj) { + rte_errno = ENOMEM; + goto error; + } + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + do { + mlx5_txq_release(dev, i); + } while (i-- != 0); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Stop traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx5_rxq_stop(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) + mlx5_rxq_release(dev, i); +} + +/** + * Start traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_rxq_start(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + int ret = 0; + enum mlx5_rxq_obj_type obj_type = MLX5_RXQ_OBJ_TYPE_IBV; + struct mlx5_rxq_data *rxq = NULL; + + for (i = 0; i < priv->rxqs_n; ++i) { + rxq = (*priv->rxqs)[i]; + if (rxq && rxq->lro) { + obj_type = MLX5_RXQ_OBJ_TYPE_DEVX_RQ; + break; + } + } + /* Allocate/reuse/resize mempool for Multi-Packet RQ. */ + if (mlx5_mprq_alloc_mp(dev)) { + /* Should not release Rx queues but return immediately. */ + return -rte_errno; + } + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i); + struct rte_mempool *mp; + + if (!rxq_ctrl) + continue; + if (rxq_ctrl->type == MLX5_RXQ_TYPE_HAIRPIN) { + rxq_ctrl->obj = mlx5_rxq_obj_new + (dev, i, MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN); + if (!rxq_ctrl->obj) + goto error; + continue; + } + /* Pre-register Rx mempool. */ + mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_ctrl->rxq.mprq_mp : rxq_ctrl->rxq.mp; + DRV_LOG(DEBUG, + "port %u Rx queue %u registering" + " mp %s having %u chunks", + dev->data->port_id, rxq_ctrl->rxq.idx, + mp->name, mp->nb_mem_chunks); + mlx5_mr_update_mp(dev, &rxq_ctrl->rxq.mr_ctrl, mp); + ret = rxq_alloc_elts(rxq_ctrl); + if (ret) + goto error; + rxq_ctrl->obj = mlx5_rxq_obj_new(dev, i, obj_type); + if (!rxq_ctrl->obj) + goto error; + if (obj_type == MLX5_RXQ_OBJ_TYPE_IBV) + rxq_ctrl->wqn = rxq_ctrl->obj->wq->wq_num; + else if (obj_type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) + rxq_ctrl->wqn = rxq_ctrl->obj->rq->id; + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + do { + mlx5_rxq_release(dev, i); + } while (i-- != 0); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Binds Tx queues to Rx queues for hairpin. + * + * Binds Tx queues to the target Rx queues. 
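+ *
+ * The peer Rx queue of each hairpin Tx queue comes from the hairpin
+ * configuration passed at queue setup time. Illustrative application-side
+ * sketch using the generic ethdev hairpin API (queue indexes are
+ * placeholders; this PMD requires a single peer on the same port, see
+ * mlx5_tx_hairpin_queue_setup()):
+ *
+ *   struct rte_eth_hairpin_conf hp_conf = {
+ *           .peer_count = 1,
+ *           .peers[0] = { .port = port_id, .queue = rx_hp_queue },
+ *   };
+ *   rte_eth_tx_hairpin_queue_setup(port_id, tx_hp_queue, 512, &hp_conf);
+ *   hp_conf.peers[0].queue = tx_hp_queue;
+ *   rte_eth_rx_hairpin_queue_setup(port_id, rx_hp_queue, 512, &hp_conf);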
+ * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_hairpin_bind(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_modify_sq_attr sq_attr = { 0 }; + struct mlx5_devx_modify_rq_attr rq_attr = { 0 }; + struct mlx5_txq_ctrl *txq_ctrl; + struct mlx5_rxq_ctrl *rxq_ctrl; + struct mlx5_devx_obj *sq; + struct mlx5_devx_obj *rq; + unsigned int i; + int ret = 0; + + for (i = 0; i != priv->txqs_n; ++i) { + txq_ctrl = mlx5_txq_get(dev, i); + if (!txq_ctrl) + continue; + if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) { + mlx5_txq_release(dev, i); + continue; + } + if (!txq_ctrl->obj) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "port %u no txq object found: %d", + dev->data->port_id, i); + mlx5_txq_release(dev, i); + return -rte_errno; + } + sq = txq_ctrl->obj->sq; + rxq_ctrl = mlx5_rxq_get(dev, + txq_ctrl->hairpin_conf.peers[0].queue); + if (!rxq_ctrl) { + mlx5_txq_release(dev, i); + rte_errno = EINVAL; + DRV_LOG(ERR, "port %u no rxq object found: %d", + dev->data->port_id, + txq_ctrl->hairpin_conf.peers[0].queue); + return -rte_errno; + } + if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN || + rxq_ctrl->hairpin_conf.peers[0].queue != i) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "port %u Tx queue %d can't be binded to " + "Rx queue %d", dev->data->port_id, + i, txq_ctrl->hairpin_conf.peers[0].queue); + goto error; + } + rq = rxq_ctrl->obj->rq; + if (!rq) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "port %u hairpin no matching rxq: %d", + dev->data->port_id, + txq_ctrl->hairpin_conf.peers[0].queue); + goto error; + } + sq_attr.state = MLX5_SQC_STATE_RDY; + sq_attr.sq_state = MLX5_SQC_STATE_RST; + sq_attr.hairpin_peer_rq = rq->id; + sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id; + ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr); + if (ret) + goto error; + rq_attr.state = MLX5_SQC_STATE_RDY; + rq_attr.rq_state = MLX5_SQC_STATE_RST; + rq_attr.hairpin_peer_sq = sq->id; + rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id; + ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr); + if (ret) + goto error; + mlx5_txq_release(dev, i); + mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue); + } + return 0; +error: + mlx5_txq_release(dev, i); + mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue); + return -rte_errno; +} + +/** + * DPDK callback to start the device. + * + * Simulate device start by attaching all configured flows. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
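+ *
+ * This is reached through rte_eth_dev_start() after the usual ethdev
+ * configuration sequence, e.g. (illustrative sketch with generic DPDK
+ * calls; queue sizes and the "mp" mempool are placeholders):
+ *
+ *   rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &port_conf);
+ *   for (q = 0; q < nb_rxq; q++)
+ *           rte_eth_rx_queue_setup(port_id, q, 512, SOCKET_ID_ANY,
+ *                                  NULL, mp);
+ *   for (q = 0; q < nb_txq; q++)
+ *           rte_eth_tx_queue_setup(port_id, q, 512, SOCKET_ID_ANY, NULL);
+ *   rte_eth_dev_start(port_id);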
+ */ +int +mlx5_dev_start(struct rte_eth_dev *dev) +{ + int ret; + int fine_inline; + + DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id); + fine_inline = rte_mbuf_dynflag_lookup + (RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL); + if (fine_inline > 0) + rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline; + else + rte_net_mlx5_dynf_inline_mask = 0; + if (dev->data->nb_rx_queues > 0) { + ret = mlx5_dev_configure_rss_reta(dev); + if (ret) { + DRV_LOG(ERR, "port %u reta config failed: %s", + dev->data->port_id, strerror(rte_errno)); + return -rte_errno; + } + } + ret = mlx5_txq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Tx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); + return -rte_errno; + } + ret = mlx5_rxq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_txq_stop(dev); + return -rte_errno; + } + ret = mlx5_hairpin_bind(dev); + if (ret) { + DRV_LOG(ERR, "port %u hairpin binding failed: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_txq_stop(dev); + return -rte_errno; + } + /* Set started flag here for the following steps like control flow. */ + dev->data->dev_started = 1; + ret = mlx5_rx_intr_vec_enable(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx interrupt vector creation failed", + dev->data->port_id); + goto error; + } + mlx5_stats_init(dev); + ret = mlx5_traffic_enable(dev); + if (ret) { + DRV_LOG(ERR, "port %u failed to set defaults flows", + dev->data->port_id); + goto error; + } + /* Set a mask and offset of dynamic metadata flows into Rx queues*/ + mlx5_flow_rxq_dynf_metadata_set(dev); + /* + * In non-cached mode, it only needs to start the default mreg copy + * action and no flow created by application exists anymore. + * But it is worth wrapping the interface for further usage. + */ + ret = mlx5_flow_start_default(dev); + if (ret) { + DRV_LOG(DEBUG, "port %u failed to start default actions: %s", + dev->data->port_id, strerror(rte_errno)); + goto error; + } + rte_wmb(); + dev->tx_pkt_burst = mlx5_select_tx_function(dev); + dev->rx_pkt_burst = mlx5_select_rx_function(dev); + /* Enable datapath on secondary process. */ + mlx5_mp_req_start_rxtx(dev); + mlx5_dev_interrupt_handler_install(dev); + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + /* Rollback. */ + dev->data->dev_started = 0; + mlx5_flow_stop_default(dev); + mlx5_traffic_disable(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * DPDK callback to stop the device. + * + * Simulate device stop by detaching all configured flows. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx5_dev_stop(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + dev->data->dev_started = 0; + /* Prevent crashes when queues are still in use. */ + dev->rx_pkt_burst = removed_rx_burst; + dev->tx_pkt_burst = removed_tx_burst; + rte_wmb(); + /* Disable datapath on secondary process. */ + mlx5_mp_req_stop_rxtx(dev); + usleep(1000 * priv->rxqs_n); + DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id); + mlx5_flow_stop_default(dev); + /* Control flows for default traffic can be removed firstly. */ + mlx5_traffic_disable(dev); + /* All RX queue flags will be cleared in the flush interface. 
*/ + mlx5_flow_list_flush(dev, &priv->flows, true); + mlx5_rx_intr_vec_disable(dev); + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); +} + +/** + * Enable traffic flows configured by control plane + * + * @param dev + * Pointer to Ethernet device private data. + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_traffic_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow_item_eth bcast = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }; + struct rte_flow_item_eth ipv6_multi_spec = { + .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00", + }; + struct rte_flow_item_eth ipv6_multi_mask = { + .dst.addr_bytes = "\xff\xff\x00\x00\x00\x00", + }; + struct rte_flow_item_eth unicast = { + .src.addr_bytes = "\x00\x00\x00\x00\x00\x00", + }; + struct rte_flow_item_eth unicast_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }; + const unsigned int vlan_filter_n = priv->vlan_filter_n; + const struct rte_ether_addr cmp = { + .addr_bytes = "\x00\x00\x00\x00\x00\x00", + }; + unsigned int i; + unsigned int j; + int ret; + + /* + * Hairpin txq default flow should be created no matter if it is + * isolation mode. Or else all the packets to be sent will be sent + * out directly without the TX flow actions, e.g. encapsulation. + */ + for (i = 0; i != priv->txqs_n; ++i) { + struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i); + if (!txq_ctrl) + continue; + if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) { + ret = mlx5_ctrl_flow_source_queue(dev, i); + if (ret) { + mlx5_txq_release(dev, i); + goto error; + } + } + mlx5_txq_release(dev, i); + } + if (priv->config.dv_esw_en && !priv->config.vf) { + if (mlx5_flow_create_esw_table_zero_flow(dev)) + priv->fdb_def_rule = 1; + else + DRV_LOG(INFO, "port %u FDB default rule cannot be" + " configured - only Eswitch group 0 flows are" + " supported.", dev->data->port_id); + } + if (priv->isolated) + return 0; + if (dev->data->promiscuous) { + struct rte_flow_item_eth promisc = { + .dst.addr_bytes = "\x00\x00\x00\x00\x00\x00", + .src.addr_bytes = "\x00\x00\x00\x00\x00\x00", + .type = 0, + }; + + ret = mlx5_ctrl_flow(dev, &promisc, &promisc); + if (ret) + goto error; + } + if (dev->data->all_multicast) { + struct rte_flow_item_eth multicast = { + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + .src.addr_bytes = "\x00\x00\x00\x00\x00\x00", + .type = 0, + }; + + ret = mlx5_ctrl_flow(dev, &multicast, &multicast); + if (ret) + goto error; + } else { + /* Add broadcast/multicast flows. */ + for (i = 0; i != vlan_filter_n; ++i) { + uint16_t vlan = priv->vlan_filter[i]; + + struct rte_flow_item_vlan vlan_spec = { + .tci = rte_cpu_to_be_16(vlan), + }; + struct rte_flow_item_vlan vlan_mask = + rte_flow_item_vlan_mask; + + ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast, + &vlan_spec, &vlan_mask); + if (ret) + goto error; + ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec, + &ipv6_multi_mask, + &vlan_spec, &vlan_mask); + if (ret) + goto error; + } + if (!vlan_filter_n) { + ret = mlx5_ctrl_flow(dev, &bcast, &bcast); + if (ret) + goto error; + ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec, + &ipv6_multi_mask); + if (ret) + goto error; + } + } + /* Add MAC address flows. 
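+	 * One unicast control flow is created for every configured MAC
+	 * address; when VLAN filters are present, a flow is created per
+	 * (MAC address, VLAN) pair instead.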
*/ + for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) { + struct rte_ether_addr *mac = &dev->data->mac_addrs[i]; + + if (!memcmp(mac, &cmp, sizeof(*mac))) + continue; + memcpy(&unicast.dst.addr_bytes, + mac->addr_bytes, + RTE_ETHER_ADDR_LEN); + for (j = 0; j != vlan_filter_n; ++j) { + uint16_t vlan = priv->vlan_filter[j]; + + struct rte_flow_item_vlan vlan_spec = { + .tci = rte_cpu_to_be_16(vlan), + }; + struct rte_flow_item_vlan vlan_mask = + rte_flow_item_vlan_mask; + + ret = mlx5_ctrl_flow_vlan(dev, &unicast, + &unicast_mask, + &vlan_spec, + &vlan_mask); + if (ret) + goto error; + } + if (!vlan_filter_n) { + ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask); + if (ret) + goto error; + } + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_flow_list_flush(dev, &priv->ctrl_flows, false); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + + +/** + * Disable traffic flows configured by control plane + * + * @param dev + * Pointer to Ethernet device private data. + */ +void +mlx5_traffic_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->ctrl_flows, false); +} + +/** + * Restart traffic flows configured by control plane + * + * @param dev + * Pointer to Ethernet device private data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_traffic_restart(struct rte_eth_dev *dev) +{ + if (dev->data->dev_started) { + mlx5_traffic_disable(dev); + return mlx5_traffic_enable(dev); + } + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c new file mode 100644 index 000000000..a211fa91b --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c @@ -0,0 +1,1470 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <string.h> +#include <stdint.h> +#include <unistd.h> +#include <sys/mman.h> +#include <inttypes.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> +#include <rte_common.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_common.h> +#include <mlx5_common_mr.h> + +#include "mlx5_defs.h" +#include "mlx5_utils.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" + +/** + * Allocate TX queue elements. + * + * @param txq_ctrl + * Pointer to TX queue structure. + */ +void +txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl) +{ + const unsigned int elts_n = 1 << txq_ctrl->txq.elts_n; + unsigned int i; + + for (i = 0; (i != elts_n); ++i) + txq_ctrl->txq.elts[i] = NULL; + DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n); + txq_ctrl->txq.elts_head = 0; + txq_ctrl->txq.elts_tail = 0; + txq_ctrl->txq.elts_comp = 0; +} + +/** + * Free TX queue elements. + * + * @param txq_ctrl + * Pointer to TX queue structure. 
+ */ +void +txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl) +{ + const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n; + const uint16_t elts_m = elts_n - 1; + uint16_t elts_head = txq_ctrl->txq.elts_head; + uint16_t elts_tail = txq_ctrl->txq.elts_tail; + struct rte_mbuf *(*elts)[elts_n] = &txq_ctrl->txq.elts; + + DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx); + txq_ctrl->txq.elts_head = 0; + txq_ctrl->txq.elts_tail = 0; + txq_ctrl->txq.elts_comp = 0; + + while (elts_tail != elts_head) { + struct rte_mbuf *elt = (*elts)[elts_tail & elts_m]; + + MLX5_ASSERT(elt != NULL); + rte_pktmbuf_free_seg(elt); +#ifdef RTE_LIBRTE_MLX5_DEBUG + /* Poisoning. */ + memset(&(*elts)[elts_tail & elts_m], + 0x77, + sizeof((*elts)[elts_tail & elts_m])); +#endif + ++elts_tail; + } +} + +/** + * Returns the per-port supported offloads. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Supported Tx offloads. + */ +uint64_t +mlx5_get_tx_port_offloads(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint64_t offloads = (DEV_TX_OFFLOAD_MULTI_SEGS | + DEV_TX_OFFLOAD_VLAN_INSERT); + struct mlx5_dev_config *config = &priv->config; + + if (config->hw_csum) + offloads |= (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM); + if (config->tso) + offloads |= DEV_TX_OFFLOAD_TCP_TSO; + if (config->swp) { + if (config->hw_csum) + offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (config->tso) + offloads |= (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO); + } + if (config->tunnel_en) { + if (config->hw_csum) + offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (config->tso) + offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_GENEVE_TNL_TSO); + } + return offloads; +} + +/** + * Tx queue presetup checks. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Tx queue index. + * @param desc + * Number of descriptors to configure in queue. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_tx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (desc <= MLX5_TX_COMP_THRESH) { + DRV_LOG(WARNING, + "port %u number of descriptors requested for Tx queue" + " %u must be higher than MLX5_TX_COMP_THRESH, using %u" + " instead of %u", + dev->data->port_id, idx, MLX5_TX_COMP_THRESH + 1, desc); + desc = MLX5_TX_COMP_THRESH + 1; + } + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Tx queue" + " %u to the next power of two (%d)", + dev->data->port_id, idx, desc); + } + DRV_LOG(DEBUG, "port %u configuring queue %u for %u descriptors", + dev->data->port_id, idx, desc); + if (idx >= priv->txqs_n) { + DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->txqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; + } + if (!mlx5_txq_releasable(dev, idx)) { + rte_errno = EBUSY; + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + return -rte_errno; + } + mlx5_txq_release(dev, idx); + return 0; +} +/** + * DPDK callback to configure a TX queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. 
+ * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + int res; + + res = mlx5_tx_queue_pre_setup(dev, idx, desc); + if (res) + return res; + txq_ctrl = mlx5_txq_new(dev, idx, desc, socket, conf); + if (!txq_ctrl) { + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + return -rte_errno; + } + DRV_LOG(DEBUG, "port %u adding Tx queue %u to list", + dev->data->port_id, idx); + (*priv->txqs)[idx] = &txq_ctrl->txq; + return 0; +} + +/** + * DPDK callback to configure a TX hairpin queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param[in] hairpin_conf + * The hairpin binding configuration. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_tx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + int res; + + res = mlx5_tx_queue_pre_setup(dev, idx, desc); + if (res) + return res; + if (hairpin_conf->peer_count != 1 || + hairpin_conf->peers[0].port != dev->data->port_id || + hairpin_conf->peers[0].queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u unable to setup hairpin queue index %u " + " invalid hairpind configuration", dev->data->port_id, + idx); + rte_errno = EINVAL; + return -rte_errno; + } + txq_ctrl = mlx5_txq_hairpin_new(dev, idx, desc, hairpin_conf); + if (!txq_ctrl) { + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + return -rte_errno; + } + DRV_LOG(DEBUG, "port %u adding Tx queue %u to list", + dev->data->port_id, idx); + (*priv->txqs)[idx] = &txq_ctrl->txq; + return 0; +} + +/** + * DPDK callback to release a TX queue. + * + * @param dpdk_txq + * Generic TX queue pointer. + */ +void +mlx5_tx_queue_release(void *dpdk_txq) +{ + struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; + struct mlx5_txq_ctrl *txq_ctrl; + struct mlx5_priv *priv; + unsigned int i; + + if (txq == NULL) + return; + txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + priv = txq_ctrl->priv; + for (i = 0; (i != priv->txqs_n); ++i) + if ((*priv->txqs)[i] == txq) { + DRV_LOG(DEBUG, "port %u removing Tx queue %u from list", + PORT_ID(priv), txq->idx); + mlx5_txq_release(ETH_DEV(priv), i); + break; + } +} + +/** + * Configure the doorbell register non-cached attribute. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * @param page_size + * Systme page size + */ +static void +txq_uar_ncattr_init(struct mlx5_txq_ctrl *txq_ctrl, size_t page_size) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + off_t cmd; + + txq_ctrl->txq.db_heu = priv->config.dbnc == MLX5_TXDB_HEURISTIC; + txq_ctrl->txq.db_nc = 0; + /* Check the doorbell register mapping type. 
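+	 * The UAR mmap offset provided by rdma-core encodes the mapping
+	 * command in its upper bits; when it indicates non-cached (NC)
+	 * pages, the queue is flagged (db_nc) so the Tx burst doorbell
+	 * routine applies the write sequence suited to a non
+	 * write-combining mapping.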
*/ + cmd = txq_ctrl->uar_mmap_offset / page_size; + cmd >>= MLX5_UAR_MMAP_CMD_SHIFT; + cmd &= MLX5_UAR_MMAP_CMD_MASK; + if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD) + txq_ctrl->txq.db_nc = 1; +} + +/** + * Initialize Tx UAR registers for primary process. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + */ +static void +txq_uar_init(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv)); + const size_t page_size = sysconf(_SC_PAGESIZE); +#ifndef RTE_ARCH_64 + unsigned int lock_idx; +#endif + + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + return; + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + MLX5_ASSERT(ppriv); + ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg; + txq_uar_ncattr_init(txq_ctrl, page_size); +#ifndef RTE_ARCH_64 + /* Assign an UAR lock according to UAR page number */ + lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & + MLX5_UAR_PAGE_NUM_MASK; + txq_ctrl->txq.uar_lock = &priv->uar_lock[lock_idx]; +#endif +} + +/** + * Remap UAR register of a Tx queue for secondary process. + * + * Remapped address is stored at the table in the process private structure of + * the device, indexed by queue index. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * @param fd + * Verbs file descriptor to map UAR pages. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +txq_uar_init_secondary(struct mlx5_txq_ctrl *txq_ctrl, int fd) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv)); + struct mlx5_txq_data *txq = &txq_ctrl->txq; + void *addr; + uintptr_t uar_va; + uintptr_t offset; + const size_t page_size = sysconf(_SC_PAGESIZE); + + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + return 0; + MLX5_ASSERT(ppriv); + /* + * As rdma-core, UARs are mapped in size of OS page + * size. Ref to libmlx5 function: mlx5_init_context() + */ + uar_va = (uintptr_t)txq_ctrl->bf_reg; + offset = uar_va & (page_size - 1); /* Offset in page. */ + addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd, + txq_ctrl->uar_mmap_offset); + if (addr == MAP_FAILED) { + DRV_LOG(ERR, + "port %u mmap failed for BF reg of txq %u", + txq->port_id, txq->idx); + rte_errno = ENXIO; + return -rte_errno; + } + addr = RTE_PTR_ADD(addr, offset); + ppriv->uar_table[txq->idx] = addr; + txq_uar_ncattr_init(txq_ctrl, page_size); + return 0; +} + +/** + * Unmap UAR register of a Tx queue for secondary process. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + */ +static void +txq_uar_uninit_secondary(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(txq_ctrl->priv)); + const size_t page_size = sysconf(_SC_PAGESIZE); + void *addr; + + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + return; + addr = ppriv->uar_table[txq_ctrl->txq.idx]; + munmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size); +} + +/** + * Initialize Tx UAR registers for secondary process. + * + * @param dev + * Pointer to Ethernet device. + * @param fd + * Verbs file descriptor to map UAR pages. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +int +mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq; + struct mlx5_txq_ctrl *txq_ctrl; + unsigned int i; + int ret; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY); + for (i = 0; i != priv->txqs_n; ++i) { + if (!(*priv->txqs)[i]) + continue; + txq = (*priv->txqs)[i]; + txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + continue; + MLX5_ASSERT(txq->idx == (uint16_t)i); + ret = txq_uar_init_secondary(txq_ctrl, fd); + if (ret) + goto error; + } + return 0; +error: + /* Rollback. */ + do { + if (!(*priv->txqs)[i]) + continue; + txq = (*priv->txqs)[i]; + txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + txq_uar_uninit_secondary(txq_ctrl); + } while (i--); + return -rte_errno; +} + +/** + * Create the Tx hairpin queue object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Tx queue array + * + * @return + * The hairpin DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_txq_obj * +mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq_data = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq_data, struct mlx5_txq_ctrl, txq); + struct mlx5_devx_create_sq_attr attr = { 0 }; + struct mlx5_txq_obj *tmpl = NULL; + int ret = 0; + uint32_t max_wq_data; + + MLX5_ASSERT(txq_data); + MLX5_ASSERT(!txq_ctrl->obj); + tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0, + txq_ctrl->socket); + if (!tmpl) { + DRV_LOG(ERR, + "port %u Tx queue %u cannot allocate memory resources", + dev->data->port_id, txq_data->idx); + rte_errno = ENOMEM; + goto error; + } + tmpl->type = MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN; + tmpl->txq_ctrl = txq_ctrl; + attr.hairpin = 1; + attr.tis_lst_sz = 1; + max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz; + /* Jumbo frames > 9KB should be supported, and more packets. */ + if (priv->config.log_hp_size != (uint32_t)MLX5_ARG_UNSET) { + if (priv->config.log_hp_size > max_wq_data) { + DRV_LOG(ERR, "total data size %u power of 2 is " + "too large for hairpin", + priv->config.log_hp_size); + rte_errno = ERANGE; + return NULL; + } + attr.wq_attr.log_hairpin_data_sz = priv->config.log_hp_size; + } else { + attr.wq_attr.log_hairpin_data_sz = + (max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ? + max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE; + } + /* Set the packets number to the maximum value for performance. */ + attr.wq_attr.log_hairpin_num_packets = + attr.wq_attr.log_hairpin_data_sz - + MLX5_HAIRPIN_QUEUE_STRIDE; + attr.tis_num = priv->sh->tis->id; + tmpl->sq = mlx5_devx_cmd_create_sq(priv->sh->ctx, &attr); + if (!tmpl->sq) { + DRV_LOG(ERR, + "port %u tx hairpin queue %u can't create sq object", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + DRV_LOG(DEBUG, "port %u sxq %u updated with %p", dev->data->port_id, + idx, (void *)&tmpl); + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->txqsobj, tmpl, next); + return tmpl; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (tmpl->tis) + mlx5_devx_cmd_destroy(tmpl->tis); + if (tmpl->sq) + mlx5_devx_cmd_destroy(tmpl->sq); + rte_errno = ret; /* Restore rte_errno. */ + return NULL; +} + +/** + * Create the Tx queue Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Tx queue array. 
+ * @param type + * Type of the Tx queue object to create. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +struct mlx5_txq_obj * +mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_txq_obj_type type) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq_data = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq_data, struct mlx5_txq_ctrl, txq); + struct mlx5_txq_obj tmpl; + struct mlx5_txq_obj *txq_obj = NULL; + union { + struct ibv_qp_init_attr_ex init; + struct ibv_cq_init_attr_ex cq; + struct ibv_qp_attr mod; + } attr; + unsigned int cqe_n; + struct mlx5dv_qp qp = { .comp_mask = MLX5DV_QP_MASK_UAR_MMAP_OFFSET }; + struct mlx5dv_cq cq_info; + struct mlx5dv_obj obj; + const int desc = 1 << txq_data->elts_n; + int ret = 0; + + if (type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN) + return mlx5_txq_obj_hairpin_new(dev, idx); +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + /* If using DevX, need additional mask to read tisn value. */ + if (priv->config.devx && !priv->sh->tdn) + qp.comp_mask |= MLX5DV_QP_MASK_RAW_QP_HANDLES; +#endif + MLX5_ASSERT(txq_data); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_TX_QUEUE; + priv->verbs_alloc_ctx.obj = txq_ctrl; + if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) { + DRV_LOG(ERR, + "port %u MLX5_ENABLE_CQE_COMPRESSION must never be set", + dev->data->port_id); + rte_errno = EINVAL; + return NULL; + } + memset(&tmpl, 0, sizeof(struct mlx5_txq_obj)); + attr.cq = (struct ibv_cq_init_attr_ex){ + .comp_mask = 0, + }; + cqe_n = desc / MLX5_TX_COMP_THRESH + + 1 + MLX5_TX_COMP_THRESH_INLINE_DIV; + tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0); + if (tmpl.cq == NULL) { + DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.init = (struct ibv_qp_init_attr_ex){ + /* CQ to be associated with the send queue. */ + .send_cq = tmpl.cq, + /* CQ to be associated with the receive queue. */ + .recv_cq = tmpl.cq, + .cap = { + /* Max number of outstanding WRs. */ + .max_send_wr = + ((priv->sh->device_attr.orig_attr.max_qp_wr < + desc) ? + priv->sh->device_attr.orig_attr.max_qp_wr : + desc), + /* + * Max number of scatter/gather elements in a WR, + * must be 1 to prevent libmlx5 from trying to affect + * too much memory. TX gather is not impacted by the + * device_attr.max_sge limit and will still work + * properly. + */ + .max_send_sge = 1, + }, + .qp_type = IBV_QPT_RAW_PACKET, + /* + * Do *NOT* enable this, completions events are managed per + * Tx burst. + */ + .sq_sig_all = 0, + .pd = priv->sh->pd, + .comp_mask = IBV_QP_INIT_ATTR_PD, + }; + if (txq_data->inlen_send) + attr.init.cap.max_inline_data = txq_ctrl->max_inline_data; + if (txq_data->tso_en) { + attr.init.max_tso_header = txq_ctrl->max_tso_header; + attr.init.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER; + } + tmpl.qp = mlx5_glue->create_qp_ex(priv->sh->ctx, &attr.init); + if (tmpl.qp == NULL) { + DRV_LOG(ERR, "port %u Tx queue %u QP creation failure", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.mod = (struct ibv_qp_attr){ + /* Move the QP to this state. */ + .qp_state = IBV_QPS_INIT, + /* IB device port number. 
*/ + .port_num = (uint8_t)priv->ibv_port, + }; + ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, + (IBV_QP_STATE | IBV_QP_PORT)); + if (ret) { + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_INIT failed", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.mod = (struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTR + }; + ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTR failed", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.mod.qp_state = IBV_QPS_RTS; + ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTS failed", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + txq_obj = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_txq_obj), 0, + txq_ctrl->socket); + if (!txq_obj) { + DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + obj.cq.in = tmpl.cq; + obj.cq.out = &cq_info; + obj.qp.in = tmpl.qp; + obj.qp.out = &qp; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP); + if (ret != 0) { + rte_errno = errno; + goto error; + } + if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; + goto error; + } + txq_data->cqe_n = log2above(cq_info.cqe_cnt); + txq_data->cqe_s = 1 << txq_data->cqe_n; + txq_data->cqe_m = txq_data->cqe_s - 1; + txq_data->qp_num_8s = tmpl.qp->qp_num << 8; + txq_data->wqes = qp.sq.buf; + txq_data->wqe_n = log2above(qp.sq.wqe_cnt); + txq_data->wqe_s = 1 << txq_data->wqe_n; + txq_data->wqe_m = txq_data->wqe_s - 1; + txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s; + txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR]; + txq_data->cq_db = cq_info.dbrec; + txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf; + txq_data->cq_ci = 0; + txq_data->cq_pi = 0; + txq_data->wqe_ci = 0; + txq_data->wqe_pi = 0; + txq_data->wqe_comp = 0; + txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV; + txq_data->fcqs = rte_calloc_socket(__func__, + txq_data->cqe_s, + sizeof(*txq_data->fcqs), + RTE_CACHE_LINE_SIZE, + txq_ctrl->socket); + if (!txq_data->fcqs) { + DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory (FCQ)", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + /* + * If using DevX need to query and store TIS transport domain value. + * This is done once per port. + * Will use this value on Rx, when creating matching TIR. 
+ */ + if (priv->config.devx && !priv->sh->tdn) { + ret = mlx5_devx_cmd_qp_query_tis_td(tmpl.qp, qp.tisn, + &priv->sh->tdn); + if (ret) { + DRV_LOG(ERR, "Fail to query port %u Tx queue %u QP TIS " + "transport domain", dev->data->port_id, idx); + rte_errno = EINVAL; + goto error; + } else { + DRV_LOG(DEBUG, "port %u Tx queue %u TIS number %d " + "transport domain %d", dev->data->port_id, + idx, qp.tisn, priv->sh->tdn); + } + } +#endif + txq_obj->qp = tmpl.qp; + txq_obj->cq = tmpl.cq; + rte_atomic32_inc(&txq_obj->refcnt); + txq_ctrl->bf_reg = qp.bf.reg; + if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) { + txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset; + DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%"PRIx64, + dev->data->port_id, txq_ctrl->uar_mmap_offset); + } else { + DRV_LOG(ERR, + "port %u failed to retrieve UAR info, invalid" + " libmlx5.so", + dev->data->port_id); + rte_errno = EINVAL; + goto error; + } + txq_uar_init(txq_ctrl); + LIST_INSERT_HEAD(&priv->txqsobj, txq_obj, next); + txq_obj->txq_ctrl = txq_ctrl; + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + return txq_obj; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (tmpl.cq) + claim_zero(mlx5_glue->destroy_cq(tmpl.cq)); + if (tmpl.qp) + claim_zero(mlx5_glue->destroy_qp(tmpl.qp)); + if (txq_data && txq_data->fcqs) + rte_free(txq_data->fcqs); + if (txq_obj) + rte_free(txq_obj); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + rte_errno = ret; /* Restore rte_errno. */ + return NULL; +} + +/** + * Get an Tx queue Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Tx queue array. + * + * @return + * The Verbs object if it exists. + */ +struct mlx5_txq_obj * +mlx5_txq_obj_get(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq_ctrl; + + if (idx >= priv->txqs_n) + return NULL; + if (!(*priv->txqs)[idx]) + return NULL; + txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); + if (txq_ctrl->obj) + rte_atomic32_inc(&txq_ctrl->obj->refcnt); + return txq_ctrl->obj; +} + +/** + * Release an Tx verbs queue object. + * + * @param txq_obj + * Verbs Tx queue object. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +int +mlx5_txq_obj_release(struct mlx5_txq_obj *txq_obj) +{ + MLX5_ASSERT(txq_obj); + if (rte_atomic32_dec_and_test(&txq_obj->refcnt)) { + if (txq_obj->type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN) { + if (txq_obj->tis) + claim_zero(mlx5_devx_cmd_destroy(txq_obj->tis)); + } else { + claim_zero(mlx5_glue->destroy_qp(txq_obj->qp)); + claim_zero(mlx5_glue->destroy_cq(txq_obj->cq)); + if (txq_obj->txq_ctrl->txq.fcqs) + rte_free(txq_obj->txq_ctrl->txq.fcqs); + } + LIST_REMOVE(txq_obj, next); + rte_free(txq_obj); + return 0; + } + return 1; +} + +/** + * Verify the Verbs Tx queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_txq_obj_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret = 0; + struct mlx5_txq_obj *txq_obj; + + LIST_FOREACH(txq_obj, &priv->txqsobj, next) { + DRV_LOG(DEBUG, "port %u Verbs Tx queue %u still referenced", + dev->data->port_id, txq_obj->txq_ctrl->txq.idx); + ++ret; + } + return ret; +} + +/** + * Calculate the total number of WQEBB for Tx queue. + * + * Simplified version of calc_sq_size() in rdma-core. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. 
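+ *
+ * The per-descriptor WQE size (control, Ethernet and data segments plus
+ * any configured inline data) is multiplied by the descriptor count,
+ * rounded up to the next power of two and expressed in 64-byte WQE
+ * basic blocks (WQEBB).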
+ * + * @return + * The number of WQEBB. + */ +static int +txq_calc_wqebb_cnt(struct mlx5_txq_ctrl *txq_ctrl) +{ + unsigned int wqe_size; + const unsigned int desc = 1 << txq_ctrl->txq.elts_n; + + wqe_size = MLX5_WQE_CSEG_SIZE + + MLX5_WQE_ESEG_SIZE + + MLX5_WSEG_SIZE - + MLX5_ESEG_MIN_INLINE_SIZE + + txq_ctrl->max_inline_data; + return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE; +} + +/** + * Calculate the maximal inline data size for Tx queue. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * + * @return + * The maximal inline data size. + */ +static unsigned int +txq_calc_inline_max(struct mlx5_txq_ctrl *txq_ctrl) +{ + const unsigned int desc = 1 << txq_ctrl->txq.elts_n; + struct mlx5_priv *priv = txq_ctrl->priv; + unsigned int wqe_size; + + wqe_size = priv->sh->device_attr.orig_attr.max_qp_wr / desc; + if (!wqe_size) + return 0; + /* + * This calculation is derived from tthe source of + * mlx5_calc_send_wqe() in rdma_core library. + */ + wqe_size = wqe_size * MLX5_WQE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WSEG_SIZE - + MLX5_WSEG_SIZE + + MLX5_DSEG_MIN_INLINE_SIZE; + return wqe_size; +} + +/** + * Set Tx queue parameters from device configuration. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + */ +static void +txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_dev_config *config = &priv->config; + unsigned int inlen_send; /* Inline data for ordinary SEND.*/ + unsigned int inlen_empw; /* Inline data for enhanced MPW. */ + unsigned int inlen_mode; /* Minimal required Inline data. */ + unsigned int txqs_inline; /* Min Tx queues to enable inline. */ + uint64_t dev_txoff = priv->dev_data->dev_conf.txmode.offloads; + bool tso = txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO | + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO); + bool vlan_inline; + unsigned int temp; + + if (config->txqs_inline == MLX5_ARG_UNSET) + txqs_inline = +#if defined(RTE_ARCH_ARM64) + (priv->pci_dev->id.device_id == + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) ? + MLX5_INLINE_MAX_TXQS_BLUEFIELD : +#endif + MLX5_INLINE_MAX_TXQS; + else + txqs_inline = (unsigned int)config->txqs_inline; + inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ? + MLX5_SEND_DEF_INLINE_LEN : + (unsigned int)config->txq_inline_max; + inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ? + MLX5_EMPW_DEF_INLINE_LEN : + (unsigned int)config->txq_inline_mpw; + inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ? + 0 : (unsigned int)config->txq_inline_min; + if (config->mps != MLX5_MPW_ENHANCED && config->mps != MLX5_MPW) + inlen_empw = 0; + /* + * If there is requested minimal amount of data to inline + * we MUST enable inlining. This is a case for ConnectX-4 + * which usually requires L2 inlined for correct operating + * and ConnectX-4 Lx which requires L2-L4 inlined to + * support E-Switch Flows. + */ + if (inlen_mode) { + if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) { + /* + * Optimize minimal inlining for single + * segment packets to fill one WQEBB + * without gaps. 
+ */ + temp = MLX5_ESEG_MIN_INLINE_SIZE; + } else { + temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE; + temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) + + MLX5_ESEG_MIN_INLINE_SIZE; + temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN); + } + if (temp != inlen_mode) { + DRV_LOG(INFO, + "port %u minimal required inline setting" + " aligned from %u to %u", + PORT_ID(priv), inlen_mode, temp); + inlen_mode = temp; + } + } + /* + * If port is configured to support VLAN insertion and device + * does not support this feature by HW (for NICs before ConnectX-5 + * or in case of wqe_vlan_insert flag is not set) we must enable + * data inline on all queues because it is supported by single + * tx_burst routine. + */ + txq_ctrl->txq.vlan_en = config->hw_vlan_insert; + vlan_inline = (dev_txoff & DEV_TX_OFFLOAD_VLAN_INSERT) && + !config->hw_vlan_insert; + /* + * If there are few Tx queues it is prioritized + * to save CPU cycles and disable data inlining at all. + */ + if (inlen_send && priv->txqs_n >= txqs_inline) { + /* + * The data sent with ordinal MLX5_OPCODE_SEND + * may be inlined in Ethernet Segment, align the + * length accordingly to fit entire WQEBBs. + */ + temp = RTE_MAX(inlen_send, + MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE); + temp -= MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE; + temp = RTE_ALIGN(temp, MLX5_WQE_SIZE); + temp += MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE; + temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE * 2); + temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN); + temp = RTE_MAX(temp, inlen_mode); + if (temp != inlen_send) { + DRV_LOG(INFO, + "port %u ordinary send inline setting" + " aligned from %u to %u", + PORT_ID(priv), inlen_send, temp); + inlen_send = temp; + } + /* + * Not aligned to cache lines, but to WQEs. + * First bytes of data (initial alignment) + * is going to be copied explicitly at the + * beginning of inlining buffer in Ethernet + * Segment. + */ + MLX5_ASSERT(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(inlen_send <= MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE * 2); + } else if (inlen_mode) { + /* + * If minimal inlining is requested we must + * enable inlining in general, despite the + * number of configured queues. Ignore the + * txq_inline_max devarg, this is not + * full-featured inline. + */ + inlen_send = inlen_mode; + inlen_empw = 0; + } else if (vlan_inline) { + /* + * Hardware does not report offload for + * VLAN insertion, we must enable data inline + * to implement feature by software. + */ + inlen_send = MLX5_ESEG_MIN_INLINE_SIZE; + inlen_empw = 0; + } else { + inlen_send = 0; + inlen_empw = 0; + } + txq_ctrl->txq.inlen_send = inlen_send; + txq_ctrl->txq.inlen_mode = inlen_mode; + txq_ctrl->txq.inlen_empw = 0; + if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) { + /* + * The data sent with MLX5_OPCODE_ENHANCED_MPSW + * may be inlined in Data Segment, align the + * length accordingly to fit entire WQEBBs. 
+ */ + temp = RTE_MAX(inlen_empw, + MLX5_WQE_SIZE + MLX5_DSEG_MIN_INLINE_SIZE); + temp -= MLX5_DSEG_MIN_INLINE_SIZE; + temp = RTE_ALIGN(temp, MLX5_WQE_SIZE); + temp += MLX5_DSEG_MIN_INLINE_SIZE; + temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX + + MLX5_DSEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE); + temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN); + if (temp != inlen_empw) { + DRV_LOG(INFO, + "port %u enhanced empw inline setting" + " aligned from %u to %u", + PORT_ID(priv), inlen_empw, temp); + inlen_empw = temp; + } + MLX5_ASSERT(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(inlen_empw <= MLX5_WQE_SIZE_MAX + + MLX5_DSEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE); + txq_ctrl->txq.inlen_empw = inlen_empw; + } + txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw); + if (tso) { + txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER; + txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data, + MLX5_MAX_TSO_HEADER); + txq_ctrl->txq.tso_en = 1; + } + txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp; + txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) & + txq_ctrl->txq.offloads) && config->swp; +} + +/** + * Adjust Tx queue data inline parameters for large queue sizes. + * The data inline feature requires multiple WQEs to fit the packets, + * and if the large amount of Tx descriptors is requested by application + * the total WQE amount may exceed the hardware capabilities. If the + * default inline setting are used we can try to adjust these ones and + * meet the hardware requirements and not exceed the queue size. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * + * @return + * Zero on success, otherwise the parameters can not be adjusted. + */ +static int +txq_adjust_params(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_dev_config *config = &priv->config; + unsigned int max_inline; + + max_inline = txq_calc_inline_max(txq_ctrl); + if (!txq_ctrl->txq.inlen_send) { + /* + * Inline data feature is not engaged at all. + * There is nothing to adjust. + */ + return 0; + } + if (txq_ctrl->max_inline_data <= max_inline) { + /* + * The requested inline data length does not + * exceed queue capabilities. 
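+		 * For reference, max_inline comes from txq_calc_inline_max():
+		 * with illustrative numbers max_qp_wr = 32768 and 1024
+		 * descriptors, each descriptor may span up to 32 WQEBBs,
+		 * giving roughly 32 * 64 - 4 * 16 + 12 = 1996 bytes of
+		 * inline room.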
+ */ + return 0; + } + if (txq_ctrl->txq.inlen_mode > max_inline) { + DRV_LOG(ERR, + "minimal data inline requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + txq_ctrl->txq.inlen_mode, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.inlen_send > max_inline && + config->txq_inline_max != MLX5_ARG_UNSET && + config->txq_inline_max > (int)max_inline) { + DRV_LOG(ERR, + "txq_inline_max requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + txq_ctrl->txq.inlen_send, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.inlen_empw > max_inline && + config->txq_inline_mpw != MLX5_ARG_UNSET && + config->txq_inline_mpw > (int)max_inline) { + DRV_LOG(ERR, + "txq_inline_mpw requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + txq_ctrl->txq.inlen_empw, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.tso_en && max_inline < MLX5_MAX_TSO_HEADER) { + DRV_LOG(ERR, + "tso header inline requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + MLX5_MAX_TSO_HEADER, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.inlen_send > max_inline) { + DRV_LOG(WARNING, + "adjust txq_inline_max (%u->%u)" + " due to large Tx queue on port %u", + txq_ctrl->txq.inlen_send, max_inline, + priv->dev_data->port_id); + txq_ctrl->txq.inlen_send = max_inline; + } + if (txq_ctrl->txq.inlen_empw > max_inline) { + DRV_LOG(WARNING, + "adjust txq_inline_mpw (%u->%u)" + "due to large Tx queue on port %u", + txq_ctrl->txq.inlen_empw, max_inline, + priv->dev_data->port_id); + txq_ctrl->txq.inlen_empw = max_inline; + } + txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->txq.inlen_send, + txq_ctrl->txq.inlen_empw); + MLX5_ASSERT(txq_ctrl->max_inline_data <= max_inline); + MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= max_inline); + MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_send); + MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_empw || + !txq_ctrl->txq.inlen_empw); + return 0; +error: + rte_errno = ENOMEM; + return -ENOMEM; +} + +/** + * Create a DPDK Tx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_txq_ctrl * +mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *tmpl; + + tmpl = rte_calloc_socket("TXQ", 1, + sizeof(*tmpl) + + desc * sizeof(struct rte_mbuf *), + 0, socket); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + if (mlx5_mr_btree_init(&tmpl->txq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + /* Save pointer of global generation number to check memory event. 
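+	 * The Tx data path compares its cached generation against this shared
+	 * counter; a mismatch indicates the global MR cache changed (e.g.
+	 * memory was freed) and the per-queue MR lookup cache is refreshed.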
*/ + tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->share_cache.dev_gen; + MLX5_ASSERT(desc > MLX5_TX_COMP_THRESH); + tmpl->txq.offloads = conf->offloads | + dev->data->dev_conf.txmode.offloads; + tmpl->priv = priv; + tmpl->socket = socket; + tmpl->txq.elts_n = log2above(desc); + tmpl->txq.elts_s = desc; + tmpl->txq.elts_m = desc - 1; + tmpl->txq.port_id = dev->data->port_id; + tmpl->txq.idx = idx; + txq_set_params(tmpl); + if (txq_adjust_params(tmpl)) + goto error; + if (txq_calc_wqebb_cnt(tmpl) > + priv->sh->device_attr.orig_attr.max_qp_wr) { + DRV_LOG(ERR, + "port %u Tx WQEBB count (%d) exceeds the limit (%d)," + " try smaller queue size", + dev->data->port_id, txq_calc_wqebb_cnt(tmpl), + priv->sh->device_attr.orig_attr.max_qp_wr); + rte_errno = ENOMEM; + goto error; + } + rte_atomic32_inc(&tmpl->refcnt); + tmpl->type = MLX5_TXQ_TYPE_STANDARD; + LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next); + return tmpl; +error: + rte_free(tmpl); + return NULL; +} + +/** + * Create a DPDK Tx hairpin queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param hairpin_conf + * The hairpin configuration. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_txq_ctrl * +mlx5_txq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *tmpl; + + tmpl = rte_calloc_socket("TXQ", 1, + sizeof(*tmpl), 0, SOCKET_ID_ANY); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + tmpl->priv = priv; + tmpl->socket = SOCKET_ID_ANY; + tmpl->txq.elts_n = log2above(desc); + tmpl->txq.port_id = dev->data->port_id; + tmpl->txq.idx = idx; + tmpl->hairpin_conf = *hairpin_conf; + tmpl->type = MLX5_TXQ_TYPE_HAIRPIN; + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next); + return tmpl; +} + +/** + * Get a Tx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * + * @return + * A pointer to the queue if it exists. + */ +struct mlx5_txq_ctrl * +mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *ctrl = NULL; + + if ((*priv->txqs)[idx]) { + ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, + txq); + mlx5_txq_obj_get(dev, idx); + rte_atomic32_inc(&ctrl->refcnt); + } + return ctrl; +} + +/** + * Release a Tx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +int +mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq; + + if (!(*priv->txqs)[idx]) + return 0; + txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); + if (txq->obj && !mlx5_txq_obj_release(txq->obj)) + txq->obj = NULL; + if (rte_atomic32_dec_and_test(&txq->refcnt)) { + txq_free_elts(txq); + mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh); + LIST_REMOVE(txq, next); + rte_free(txq); + (*priv->txqs)[idx] = NULL; + return 0; + } + return 1; +} + +/** + * Verify if the queue can be released. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * + * @return + * 1 if the queue can be released. 
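+ *   (only the control structure itself still holds a reference),
+ *   0 if it is still in use, -1 if the queue does not exist.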
+ */ +int +mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq; + + if (!(*priv->txqs)[idx]) + return -1; + txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); + return (rte_atomic32_read(&txq->refcnt) == 1); +} + +/** + * Verify the Tx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_txq_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq_ctrl; + int ret = 0; + + LIST_FOREACH(txq_ctrl, &priv->txqsctrl, next) { + DRV_LOG(DEBUG, "port %u Tx queue %u still referenced", + dev->data->port_id, txq_ctrl->txq.idx); + ++ret; + } + return ret; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c new file mode 100644 index 000000000..d29fbcbc8 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c @@ -0,0 +1,484 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#include <rte_malloc.h> +#include <rte_hash_crc.h> + +#include "mlx5_utils.h" + +struct mlx5_hlist * +mlx5_hlist_create(const char *name, uint32_t size) +{ + struct mlx5_hlist *h; + uint32_t act_size; + uint32_t alloc_size; + + if (!size) + return NULL; + /* Align to the next power of 2, 32bits integer is enough now. */ + if (!rte_is_power_of_2(size)) { + act_size = rte_align32pow2(size); + DRV_LOG(WARNING, "Size 0x%" PRIX32 " is not power of 2, will " + "be aligned to 0x%" PRIX32 ".\n", size, act_size); + } else { + act_size = size; + } + alloc_size = sizeof(struct mlx5_hlist) + + sizeof(struct mlx5_hlist_head) * act_size; + /* Using zmalloc, then no need to initialize the heads. */ + h = rte_zmalloc(name, alloc_size, RTE_CACHE_LINE_SIZE); + if (!h) { + DRV_LOG(ERR, "No memory for hash list %s creation\n", + name ? name : "None"); + return NULL; + } + if (name) + snprintf(h->name, MLX5_HLIST_NAMESIZE, "%s", name); + h->table_sz = act_size; + h->mask = act_size - 1; + DRV_LOG(DEBUG, "Hash list with %s size 0x%" PRIX32 " is created.\n", + h->name, act_size); + return h; +} + +struct mlx5_hlist_entry * +mlx5_hlist_lookup(struct mlx5_hlist *h, uint64_t key) +{ + uint32_t idx; + struct mlx5_hlist_head *first; + struct mlx5_hlist_entry *node; + + MLX5_ASSERT(h); + idx = rte_hash_crc_8byte(key, 0) & h->mask; + first = &h->heads[idx]; + LIST_FOREACH(node, first, next) { + if (node->key == key) + return node; + } + return NULL; +} + +int +mlx5_hlist_insert(struct mlx5_hlist *h, struct mlx5_hlist_entry *entry) +{ + uint32_t idx; + struct mlx5_hlist_head *first; + struct mlx5_hlist_entry *node; + + MLX5_ASSERT(h && entry); + idx = rte_hash_crc_8byte(entry->key, 0) & h->mask; + first = &h->heads[idx]; + /* No need to reuse the lookup function. */ + LIST_FOREACH(node, first, next) { + if (node->key == entry->key) + return -EEXIST; + } + LIST_INSERT_HEAD(first, entry, next); + return 0; +} + +void +mlx5_hlist_remove(struct mlx5_hlist *h __rte_unused, + struct mlx5_hlist_entry *entry) +{ + MLX5_ASSERT(entry && entry->next.le_prev); + LIST_REMOVE(entry, next); + /* Set to NULL to get rid of removing action for more than once. 
*/ + entry->next.le_prev = NULL; +} + +void +mlx5_hlist_destroy(struct mlx5_hlist *h, + mlx5_hlist_destroy_callback_fn cb, void *ctx) +{ + uint32_t idx; + struct mlx5_hlist_entry *entry; + + MLX5_ASSERT(h); + for (idx = 0; idx < h->table_sz; ++idx) { + /* no LIST_FOREACH_SAFE, using while instead */ + while (!LIST_EMPTY(&h->heads[idx])) { + entry = LIST_FIRST(&h->heads[idx]); + LIST_REMOVE(entry, next); + /* + * The owner of whole element which contains data entry + * is the user, so it's the user's duty to do the clean + * up and the free work because someone may not put the + * hlist entry at the beginning(suggested to locate at + * the beginning). Or else the default free function + * will be used. + */ + if (cb) + cb(entry, ctx); + else + rte_free(entry); + } + } + rte_free(h); +} + +static inline void +mlx5_ipool_lock(struct mlx5_indexed_pool *pool) +{ + if (pool->cfg.need_lock) + rte_spinlock_lock(&pool->lock); +} + +static inline void +mlx5_ipool_unlock(struct mlx5_indexed_pool *pool) +{ + if (pool->cfg.need_lock) + rte_spinlock_unlock(&pool->lock); +} + +static inline uint32_t +mlx5_trunk_idx_get(struct mlx5_indexed_pool *pool, uint32_t entry_idx) +{ + struct mlx5_indexed_pool_config *cfg = &pool->cfg; + uint32_t trunk_idx = 0; + uint32_t i; + + if (!cfg->grow_trunk) + return entry_idx / cfg->trunk_size; + if (entry_idx >= pool->grow_tbl[cfg->grow_trunk - 1]) { + trunk_idx = (entry_idx - pool->grow_tbl[cfg->grow_trunk - 1]) / + (cfg->trunk_size << (cfg->grow_shift * + cfg->grow_trunk)) + cfg->grow_trunk; + } else { + for (i = 0; i < cfg->grow_trunk; i++) { + if (entry_idx < pool->grow_tbl[i]) + break; + } + trunk_idx = i; + } + return trunk_idx; +} + +static inline uint32_t +mlx5_trunk_size_get(struct mlx5_indexed_pool *pool, uint32_t trunk_idx) +{ + struct mlx5_indexed_pool_config *cfg = &pool->cfg; + + return cfg->trunk_size << (cfg->grow_shift * + (trunk_idx > cfg->grow_trunk ? cfg->grow_trunk : trunk_idx)); +} + +static inline uint32_t +mlx5_trunk_idx_offset_get(struct mlx5_indexed_pool *pool, uint32_t trunk_idx) +{ + struct mlx5_indexed_pool_config *cfg = &pool->cfg; + uint32_t offset = 0; + + if (!trunk_idx) + return 0; + if (!cfg->grow_trunk) + return cfg->trunk_size * trunk_idx; + if (trunk_idx < cfg->grow_trunk) + offset = pool->grow_tbl[trunk_idx - 1]; + else + offset = pool->grow_tbl[cfg->grow_trunk - 1] + + (cfg->trunk_size << (cfg->grow_shift * + cfg->grow_trunk)) * (trunk_idx - cfg->grow_trunk); + return offset; +} + +struct mlx5_indexed_pool * +mlx5_ipool_create(struct mlx5_indexed_pool_config *cfg) +{ + struct mlx5_indexed_pool *pool; + uint32_t i; + + if (!cfg || !cfg->size || (!cfg->malloc ^ !cfg->free) || + (cfg->trunk_size && ((cfg->trunk_size & (cfg->trunk_size - 1)) || + ((__builtin_ffs(cfg->trunk_size) + TRUNK_IDX_BITS) > 32)))) + return NULL; + pool = rte_zmalloc("mlx5_ipool", sizeof(*pool) + cfg->grow_trunk * + sizeof(pool->grow_tbl[0]), RTE_CACHE_LINE_SIZE); + if (!pool) + return NULL; + pool->cfg = *cfg; + if (!pool->cfg.trunk_size) + pool->cfg.trunk_size = MLX5_IPOOL_DEFAULT_TRUNK_SIZE; + if (!cfg->malloc && !cfg->free) { + pool->cfg.malloc = rte_malloc_socket; + pool->cfg.free = rte_free; + } + pool->free_list = TRUNK_INVALID; + if (pool->cfg.need_lock) + rte_spinlock_init(&pool->lock); + /* + * Initialize the dynamic grow trunk size lookup table to have a quick + * lookup for the trunk entry index offset. 
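+	 * Illustrative example: with trunk_size = 64, grow_shift = 1 and
+	 * grow_trunk = 3 the table becomes {64, 64 + 128 = 192,
+	 * 192 + 256 = 448}, i.e. the cumulative entry count covered by
+	 * the growing trunks.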
+ */ + for (i = 0; i < cfg->grow_trunk; i++) { + pool->grow_tbl[i] = cfg->trunk_size << (cfg->grow_shift * i); + if (i > 0) + pool->grow_tbl[i] += pool->grow_tbl[i - 1]; + } + return pool; +} + +static int +mlx5_ipool_grow(struct mlx5_indexed_pool *pool) +{ + struct mlx5_indexed_trunk *trunk; + struct mlx5_indexed_trunk **trunk_tmp; + struct mlx5_indexed_trunk **p; + size_t trunk_size = 0; + size_t data_size; + size_t bmp_size; + uint32_t idx; + + if (pool->n_trunk_valid == TRUNK_MAX_IDX) + return -ENOMEM; + if (pool->n_trunk_valid == pool->n_trunk) { + /* No free trunk flags, expand trunk list. */ + int n_grow = pool->n_trunk_valid ? pool->n_trunk : + RTE_CACHE_LINE_SIZE / sizeof(void *); + + p = pool->cfg.malloc(pool->cfg.type, + (pool->n_trunk_valid + n_grow) * + sizeof(struct mlx5_indexed_trunk *), + RTE_CACHE_LINE_SIZE, rte_socket_id()); + if (!p) + return -ENOMEM; + if (pool->trunks) + memcpy(p, pool->trunks, pool->n_trunk_valid * + sizeof(struct mlx5_indexed_trunk *)); + memset(RTE_PTR_ADD(p, pool->n_trunk_valid * sizeof(void *)), 0, + n_grow * sizeof(void *)); + trunk_tmp = pool->trunks; + pool->trunks = p; + if (trunk_tmp) + pool->cfg.free(trunk_tmp); + pool->n_trunk += n_grow; + } + if (!pool->cfg.release_mem_en) { + idx = pool->n_trunk_valid; + } else { + /* Find the first available slot in trunk list */ + for (idx = 0; idx < pool->n_trunk; idx++) + if (pool->trunks[idx] == NULL) + break; + } + trunk_size += sizeof(*trunk); + data_size = mlx5_trunk_size_get(pool, idx); + bmp_size = rte_bitmap_get_memory_footprint(data_size); + /* rte_bitmap requires memory cacheline aligned. */ + trunk_size += RTE_CACHE_LINE_ROUNDUP(data_size * pool->cfg.size); + trunk_size += bmp_size; + trunk = pool->cfg.malloc(pool->cfg.type, trunk_size, + RTE_CACHE_LINE_SIZE, rte_socket_id()); + if (!trunk) + return -ENOMEM; + pool->trunks[idx] = trunk; + trunk->idx = idx; + trunk->free = data_size; + trunk->prev = TRUNK_INVALID; + trunk->next = TRUNK_INVALID; + MLX5_ASSERT(pool->free_list == TRUNK_INVALID); + pool->free_list = idx; + /* Mark all entries as available. */ + trunk->bmp = rte_bitmap_init_with_all_set(data_size, &trunk->data + [RTE_CACHE_LINE_ROUNDUP(data_size * pool->cfg.size)], + bmp_size); + MLX5_ASSERT(trunk->bmp); + pool->n_trunk_valid++; +#ifdef POOL_DEBUG + pool->trunk_new++; + pool->trunk_avail++; +#endif + return 0; +} + +void * +mlx5_ipool_malloc(struct mlx5_indexed_pool *pool, uint32_t *idx) +{ + struct mlx5_indexed_trunk *trunk; + uint64_t slab = 0; + uint32_t iidx = 0; + void *p; + + mlx5_ipool_lock(pool); + if (pool->free_list == TRUNK_INVALID) { + /* If no available trunks, grow new. */ + if (mlx5_ipool_grow(pool)) { + mlx5_ipool_unlock(pool); + return NULL; + } + } + MLX5_ASSERT(pool->free_list != TRUNK_INVALID); + trunk = pool->trunks[pool->free_list]; + MLX5_ASSERT(trunk->free); + if (!rte_bitmap_scan(trunk->bmp, &iidx, &slab)) { + mlx5_ipool_unlock(pool); + return NULL; + } + MLX5_ASSERT(slab); + iidx += __builtin_ctzll(slab); + MLX5_ASSERT(iidx != UINT32_MAX); + MLX5_ASSERT(iidx < mlx5_trunk_size_get(pool, trunk->idx)); + rte_bitmap_clear(trunk->bmp, iidx); + p = &trunk->data[iidx * pool->cfg.size]; + iidx += mlx5_trunk_idx_offset_get(pool, trunk->idx); + iidx += 1; /* non-zero index. */ + trunk->free--; +#ifdef POOL_DEBUG + pool->n_entry++; +#endif + if (!trunk->free) { + /* Full trunk will be removed from free list in imalloc. 
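+		 * At this point the trunk is the head of the free list
+		 * (entries are only handed out from the head), so unlinking
+		 * it simply advances pool->free_list to the next trunk.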
*/ + MLX5_ASSERT(pool->free_list == trunk->idx); + pool->free_list = trunk->next; + if (trunk->next != TRUNK_INVALID) + pool->trunks[trunk->next]->prev = TRUNK_INVALID; + trunk->prev = TRUNK_INVALID; + trunk->next = TRUNK_INVALID; +#ifdef POOL_DEBUG + pool->trunk_empty++; + pool->trunk_avail--; +#endif + } + *idx = iidx; + mlx5_ipool_unlock(pool); + return p; +} + +void * +mlx5_ipool_zmalloc(struct mlx5_indexed_pool *pool, uint32_t *idx) +{ + void *entry = mlx5_ipool_malloc(pool, idx); + + if (entry) + memset(entry, 0, pool->cfg.size); + return entry; +} + +void +mlx5_ipool_free(struct mlx5_indexed_pool *pool, uint32_t idx) +{ + struct mlx5_indexed_trunk *trunk; + uint32_t trunk_idx; + uint32_t entry_idx; + + if (!idx) + return; + idx -= 1; + mlx5_ipool_lock(pool); + trunk_idx = mlx5_trunk_idx_get(pool, idx); + if ((!pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk_valid) || + (pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk)) + goto out; + trunk = pool->trunks[trunk_idx]; + if (!trunk) + goto out; + entry_idx = idx - mlx5_trunk_idx_offset_get(pool, trunk->idx); + if (trunk_idx != trunk->idx || + rte_bitmap_get(trunk->bmp, entry_idx)) + goto out; + rte_bitmap_set(trunk->bmp, entry_idx); + trunk->free++; + if (pool->cfg.release_mem_en && trunk->free == mlx5_trunk_size_get + (pool, trunk->idx)) { + if (pool->free_list == trunk->idx) + pool->free_list = trunk->next; + if (trunk->next != TRUNK_INVALID) + pool->trunks[trunk->next]->prev = trunk->prev; + if (trunk->prev != TRUNK_INVALID) + pool->trunks[trunk->prev]->next = trunk->next; + pool->cfg.free(trunk); + pool->trunks[trunk_idx] = NULL; + pool->n_trunk_valid--; +#ifdef POOL_DEBUG + pool->trunk_avail--; + pool->trunk_free++; +#endif + if (pool->n_trunk_valid == 0) { + pool->cfg.free(pool->trunks); + pool->trunks = NULL; + pool->n_trunk = 0; + } + } else if (trunk->free == 1) { + /* Put into free trunk list head. 
*/ + MLX5_ASSERT(pool->free_list != trunk->idx); + trunk->next = pool->free_list; + trunk->prev = TRUNK_INVALID; + if (pool->free_list != TRUNK_INVALID) + pool->trunks[pool->free_list]->prev = trunk->idx; + pool->free_list = trunk->idx; +#ifdef POOL_DEBUG + pool->trunk_empty--; + pool->trunk_avail++; +#endif + } +#ifdef POOL_DEBUG + pool->n_entry--; +#endif +out: + mlx5_ipool_unlock(pool); +} + +void * +mlx5_ipool_get(struct mlx5_indexed_pool *pool, uint32_t idx) +{ + struct mlx5_indexed_trunk *trunk; + void *p = NULL; + uint32_t trunk_idx; + uint32_t entry_idx; + + if (!idx) + return NULL; + idx -= 1; + mlx5_ipool_lock(pool); + trunk_idx = mlx5_trunk_idx_get(pool, idx); + if ((!pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk_valid) || + (pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk)) + goto out; + trunk = pool->trunks[trunk_idx]; + if (!trunk) + goto out; + entry_idx = idx - mlx5_trunk_idx_offset_get(pool, trunk->idx); + if (trunk_idx != trunk->idx || + rte_bitmap_get(trunk->bmp, entry_idx)) + goto out; + p = &trunk->data[entry_idx * pool->cfg.size]; +out: + mlx5_ipool_unlock(pool); + return p; +} + +int +mlx5_ipool_destroy(struct mlx5_indexed_pool *pool) +{ + struct mlx5_indexed_trunk **trunks; + uint32_t i; + + MLX5_ASSERT(pool); + mlx5_ipool_lock(pool); + trunks = pool->trunks; + for (i = 0; i < pool->n_trunk; i++) { + if (trunks[i]) + pool->cfg.free(trunks[i]); + } + if (!pool->trunks) + pool->cfg.free(pool->trunks); + mlx5_ipool_unlock(pool); + rte_free(pool); + return 0; +} + +void +mlx5_ipool_dump(struct mlx5_indexed_pool *pool) +{ + printf("Pool %s entry size %u, trunks %u, %d entry per trunk, " + "total: %d\n", + pool->cfg.type, pool->cfg.size, pool->n_trunk_valid, + pool->cfg.trunk_size, pool->n_trunk_valid); +#ifdef POOL_DEBUG + printf("Pool %s entry %u, trunk alloc %u, empty: %u, " + "available %u free %u\n", + pool->cfg.type, pool->n_entry, pool->trunk_new, + pool->trunk_empty, pool->trunk_avail, pool->trunk_free); +#endif +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h new file mode 100644 index 000000000..f4ec15170 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h @@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_UTILS_H_ +#define RTE_PMD_MLX5_UTILS_H_ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <limits.h> +#include <errno.h> + +#include <rte_spinlock.h> +#include <rte_memory.h> +#include <rte_bitmap.h> + +#include <mlx5_common.h> + +#include "mlx5_defs.h" + + +/* Convert a bit number to the corresponding 64-bit mask */ +#define MLX5_BITSHIFT(v) (UINT64_C(1) << (v)) + +/* Save and restore errno around argument evaluation. */ +#define ERRNO_SAFE(x) ((errno = (int []){ errno, ((x), 0) }[0])) + +extern int mlx5_logtype; + +/* Generic printf()-like logging macro with automatic line feed. */ +#define DRV_LOG(level, ...) \ + PMD_DRV_LOG_(level, mlx5_logtype, MLX5_DRIVER_NAME, \ + __VA_ARGS__ PMD_DRV_LOG_STRIP PMD_DRV_LOG_OPAREN, \ + PMD_DRV_LOG_CPAREN) + +#define INFO(...) DRV_LOG(INFO, __VA_ARGS__) +#define WARN(...) DRV_LOG(WARNING, __VA_ARGS__) +#define ERROR(...) DRV_LOG(ERR, __VA_ARGS__) + +/* Convenience macros for accessing mbuf fields. 
*/ +#define NEXT(m) ((m)->next) +#define DATA_LEN(m) ((m)->data_len) +#define PKT_LEN(m) ((m)->pkt_len) +#define DATA_OFF(m) ((m)->data_off) +#define SET_DATA_OFF(m, o) ((m)->data_off = (o)) +#define NB_SEGS(m) ((m)->nb_segs) +#define PORT(m) ((m)->port) + +/* Transpose flags. Useful to convert IBV to DPDK flags. */ +#define TRANSPOSE(val, from, to) \ + (((from) >= (to)) ? \ + (((val) & (from)) / ((from) / (to))) : \ + (((val) & (from)) * ((to) / (from)))) + +/* + * The indexed memory entry index is made up of trunk index and offset of + * the entry in the trunk. Since the entry index is 32 bits, in case user + * prefers to have small trunks, user can change the macro below to a big + * number which helps the pool contains more trunks with lots of entries + * allocated. + */ +#define TRUNK_IDX_BITS 16 +#define TRUNK_MAX_IDX ((1 << TRUNK_IDX_BITS) - 1) +#define TRUNK_INVALID TRUNK_MAX_IDX +#define MLX5_IPOOL_DEFAULT_TRUNK_SIZE (1 << (28 - TRUNK_IDX_BITS)) +#ifdef RTE_LIBRTE_MLX5_DEBUG +#define POOL_DEBUG 1 +#endif + +struct mlx5_indexed_pool_config { + uint32_t size; /* Pool entry size. */ + uint32_t trunk_size:22; + /* + * Trunk entry number. Must be power of 2. It can be increased + * if trunk_grow enable. The trunk entry number increases with + * left shift grow_shift. Trunks with index are after grow_trunk + * will keep the entry number same with the last grow trunk. + */ + uint32_t grow_trunk:4; + /* + * Trunks with entry number increase in the pool. Set it to 0 + * to make the pool works as trunk entry fixed pool. It works + * only if grow_shift is not 0. + */ + uint32_t grow_shift:4; + /* + * Trunk entry number increase shift value, stop after grow_trunk. + * It works only if grow_trunk is not 0. + */ + uint32_t need_lock:1; + /* Lock is needed for multiple thread usage. */ + uint32_t release_mem_en:1; /* Rlease trunk when it is free. */ + const char *type; /* Memory allocate type name. */ + void *(*malloc)(const char *type, size_t size, unsigned int align, + int socket); + /* User defined memory allocator. */ + void (*free)(void *addr); /* User defined memory release. */ +}; + +struct mlx5_indexed_trunk { + uint32_t idx; /* Trunk id. */ + uint32_t prev; /* Previous free trunk in free list. */ + uint32_t next; /* Next free trunk in free list. */ + uint32_t free; /* Free entries available */ + struct rte_bitmap *bmp; + uint8_t data[] __rte_cache_aligned; /* Entry data start. */ +}; + +struct mlx5_indexed_pool { + struct mlx5_indexed_pool_config cfg; /* Indexed pool configuration. */ + rte_spinlock_t lock; /* Pool lock for multiple thread usage. */ + uint32_t n_trunk_valid; /* Trunks allocated. */ + uint32_t n_trunk; /* Trunk pointer array size. */ + /* Dim of trunk pointer array. */ + struct mlx5_indexed_trunk **trunks; + uint32_t free_list; /* Index to first free trunk. */ +#ifdef POOL_DEBUG + uint32_t n_entry; + uint32_t trunk_new; + uint32_t trunk_avail; + uint32_t trunk_empty; + uint32_t trunk_free; +#endif + uint32_t grow_tbl[]; /* Save the index offset for the grow trunks. */ +}; + +/** + * Return logarithm of the nearest power of two above input value. + * + * @param v + * Input value. + * + * @return + * Logarithm of the nearest power of two above input value. + */ +static inline unsigned int +log2above(unsigned int v) +{ + unsigned int l; + unsigned int r; + + for (l = 0, r = 0; (v >> 1); ++l, v >>= 1) + r |= (v & 1); + return l + r; +} + +/** Maximum size of string for naming the hlist table. 
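+ * Longer names passed to mlx5_hlist_create() are truncated by snprintf().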
+ */
+#define MLX5_HLIST_NAMESIZE			32
+
+/**
+ * Structure of the entry in the hash list; the user should define their own
+ * struct that contains this in order to store the data. The 'key' is 64 bits
+ * right now and it is the user's responsibility to guarantee there is no
+ * collision.
+ */
+struct mlx5_hlist_entry {
+	LIST_ENTRY(mlx5_hlist_entry) next; /* entry pointers in the list. */
+	uint64_t key; /* user defined 'key', could be the hash signature. */
+};
+
+/** Structure for hash head. */
+LIST_HEAD(mlx5_hlist_head, mlx5_hlist_entry);
+
+/** Type of function that is used to handle the data before freeing. */
+typedef void (*mlx5_hlist_destroy_callback_fn)(void *p, void *ctx);
+
+/** Hash list table structure. */
+struct mlx5_hlist {
+	char name[MLX5_HLIST_NAMESIZE]; /**< Name of the hash list. */
+	/**< number of heads, need to be power of 2. */
+	uint32_t table_sz;
+	/**< mask to get the index of the list heads. */
+	uint32_t mask;
+	struct mlx5_hlist_head heads[];	/**< list head arrays. */
+};
+
+/**
+ * Create a hash list table. The user can specify the list heads array size
+ * of the table; the size should be a power of 2 in order to get better
+ * distribution for the entries. Each entry is a part of the whole data element
+ * and the caller should be responsible for the data element's allocation and
+ * cleanup / free. Key of each entry will be calculated with CRC in order to
+ * generate a little fairer distribution.
+ *
+ * @param name
+ *   Name of the hash list (optional).
+ * @param size
+ *   Heads array size of the hash list.
+ *
+ * @return
+ *   Pointer of the hash list table created, NULL on failure.
+ */
+struct mlx5_hlist *mlx5_hlist_create(const char *name, uint32_t size);
+
+/**
+ * Search an entry matching the key.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param key
+ *   Key for the searching entry.
+ *
+ * @return
+ *   Pointer of the hlist entry if found, NULL otherwise.
+ */
+struct mlx5_hlist_entry *mlx5_hlist_lookup(struct mlx5_hlist *h, uint64_t key);
+
+/**
+ * Insert an entry into the hash list table; the entry is only part of the
+ * whole data element, and a 64-bit key is used for matching. User should
+ * construct the key or give a calculated hash signature and guarantee there
+ * is no collision.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param entry
+ *   Entry to be inserted into the hash list table.
+ *
+ * @return
+ *   - zero for success.
+ *   - -EEXIST if the entry is already inserted.
+ */
+int mlx5_hlist_insert(struct mlx5_hlist *h, struct mlx5_hlist_entry *entry);
+
+/**
+ * Remove an entry from the hash list table. User should guarantee the validity
+ * of the entry.
+ *
+ * @param h
+ *   Pointer to the hash list table. (not used)
+ * @param entry
+ *   Entry to be removed from the hash list table.
+ */
+void mlx5_hlist_remove(struct mlx5_hlist *h __rte_unused,
+		       struct mlx5_hlist_entry *entry);
+
+/**
+ * Destroy the hash list table; all the entries already inserted into the lists
+ * will be handled by the callback function provided by the user (including
+ * free if needed) before the table is freed.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param cb
+ *   Callback function for each inserted entry when destroying the hash list.
+ * @param ctx
+ *   Common context parameter used by callback function for each entry.
+ */
+void mlx5_hlist_destroy(struct mlx5_hlist *h,
+			mlx5_hlist_destroy_callback_fn cb, void *ctx);
+
+/**
+ * This function allocates a non-initialized memory entry from the pool.
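+ *
+ * Minimal usage sketch of the indexed pool API (illustrative only:
+ * "struct my_entry" and the configuration values are hypothetical,
+ * error handling is omitted):
+ *
+ *     struct mlx5_indexed_pool_config cfg = {
+ *             .size = sizeof(struct my_entry),
+ *             .trunk_size = 64,
+ *             .need_lock = 1,
+ *             .type = "my_entry_ipool",
+ *     };
+ *     struct mlx5_indexed_pool *pool = mlx5_ipool_create(&cfg);
+ *     uint32_t idx;
+ *     struct my_entry *entry = mlx5_ipool_zmalloc(pool, &idx);
+ *     ...
+ *     mlx5_ipool_free(pool, idx);
+ *     mlx5_ipool_destroy(pool);
+ *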
+ * In NUMA systems, the memory entry allocated resides on the same
+ * NUMA socket as the core that calls this function.
+ *
+ * Memory entry is allocated from a memory trunk, no specific alignment
+ * is guaranteed.
+ *
+ * @param pool
+ *   Pointer to indexed memory entry pool.
+ *   No initialization required.
+ * @param[out] idx
+ *   Pointer to memory to save allocated index.
+ *   Memory index is always a positive value.
+ * @return
+ *   - Pointer to the allocated memory entry.
+ *   - NULL on error. Not enough memory, or invalid arguments.
+ */
+void *mlx5_ipool_malloc(struct mlx5_indexed_pool *pool, uint32_t *idx);
+
+/**
+ * This function allocates a zero-initialized memory entry from the pool.
+ * In NUMA systems, the memory entry allocated resides on the same
+ * NUMA socket as the core that calls this function.
+ *
+ * Memory entry is allocated from a memory trunk, no specific alignment
+ * is guaranteed.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ *   No initialization required.
+ * @param[out] idx
+ *   Pointer to memory to save allocated index.
+ *   Memory index is always a positive value.
+ * @return
+ *   - Pointer to the allocated memory entry.
+ *   - NULL on error. Not enough memory, or invalid arguments.
+ */
+void *mlx5_ipool_zmalloc(struct mlx5_indexed_pool *pool, uint32_t *idx);
+
+/**
+ * This function frees an indexed memory entry back to the pool.
+ * Caller has to make sure that the index is allocated from the same pool.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ * @param idx
+ *   Allocated memory entry index.
+ */
+void mlx5_ipool_free(struct mlx5_indexed_pool *pool, uint32_t idx);
+
+/**
+ * This function returns the pointer of an indexed memory entry from its index.
+ * Caller has to make sure that the index is valid, and allocated
+ * from the same pool.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ * @param idx
+ *   Allocated memory index.
+ * @return
+ *   - Pointer to indexed memory entry.
+ */
+void *mlx5_ipool_get(struct mlx5_indexed_pool *pool, uint32_t idx);
+
+/**
+ * This function creates indexed memory pool.
+ * Caller has to fill the configuration structure accordingly.
+ *
+ * @param cfg
+ *   Pointer to indexed memory pool configuration.
+ * @return
+ *   - Pointer to the created pool.
+ *   - NULL on error. Not enough memory, or invalid configuration.
+ */
+struct mlx5_indexed_pool *
+mlx5_ipool_create(struct mlx5_indexed_pool_config *cfg);
+
+/**
+ * This function releases all resources of the pool.
+ * Caller has to make sure that all indexes and memories allocated
+ * from this pool are not referenced anymore.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ * @return
+ *   - non-zero value on error.
+ *   - 0 on success.
+ */
+int mlx5_ipool_destroy(struct mlx5_indexed_pool *pool);
+
+/**
+ * This function dumps debug info of the pool.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ */
+void mlx5_ipool_dump(struct mlx5_indexed_pool *pool);
+
+/*
+ * Macros for linked list based on indexed memory.
+ * Example data structure:
+ * struct Foo {
+ *	ILIST_ENTRY(uint16_t) next;
+ *	...
+ * }
+ *
+ */
+#define ILIST_ENTRY(type)						\
+struct {								\
+	type prev; /* Index of previous element. */			\
+	type next; /* Index of next element.
*/ \ +} + +#define ILIST_INSERT(pool, head, idx, elem, field) \ + do { \ + typeof(elem) peer; \ + MLX5_ASSERT((elem) && (idx)); \ + (elem)->field.next = *(head); \ + (elem)->field.prev = 0; \ + if (*(head)) { \ + (peer) = mlx5_ipool_get(pool, *(head)); \ + if (peer) \ + (peer)->field.prev = (idx); \ + } \ + *(head) = (idx); \ + } while (0) + +#define ILIST_REMOVE(pool, head, idx, elem, field) \ + do { \ + typeof(elem) peer; \ + MLX5_ASSERT(elem); \ + MLX5_ASSERT(head); \ + if ((elem)->field.prev) { \ + (peer) = mlx5_ipool_get \ + (pool, (elem)->field.prev); \ + if (peer) \ + (peer)->field.next = (elem)->field.next;\ + } \ + if ((elem)->field.next) { \ + (peer) = mlx5_ipool_get \ + (pool, (elem)->field.next); \ + if (peer) \ + (peer)->field.prev = (elem)->field.prev;\ + } \ + if (*(head) == (idx)) \ + *(head) = (elem)->field.next; \ + } while (0) + +#define ILIST_FOREACH(pool, head, idx, elem, field) \ + for ((idx) = (head), (elem) = \ + (idx) ? mlx5_ipool_get(pool, (idx)) : NULL; (elem); \ + idx = (elem)->field.next, (elem) = \ + (idx) ? mlx5_ipool_get(pool, idx) : NULL) + +/* Single index list. */ +#define SILIST_ENTRY(type) \ +struct { \ + type next; /* Index of next element. */ \ +} + +#define SILIST_INSERT(head, idx, elem, field) \ + do { \ + MLX5_ASSERT((elem) && (idx)); \ + (elem)->field.next = *(head); \ + *(head) = (idx); \ + } while (0) + +#define SILIST_FOREACH(pool, head, idx, elem, field) \ + for ((idx) = (head), (elem) = \ + (idx) ? mlx5_ipool_get(pool, (idx)) : NULL; (elem); \ + idx = (elem)->field.next, (elem) = \ + (idx) ? mlx5_ipool_get(pool, idx) : NULL) + +#endif /* RTE_PMD_MLX5_UTILS_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c new file mode 100644 index 000000000..f65e416da --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c @@ -0,0 +1,327 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <stdint.h> +#include <unistd.h> + + +/* + * Not needed by this file; included to work around the lack of off_t + * definition for mlx5dv.h with unpatched rdma-core versions. + */ +#include <sys/types.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx5dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev_driver.h> +#include <rte_common.h> +#include <rte_malloc.h> +#include <rte_hypervisor.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_nl.h> + +#include "mlx5.h" +#include "mlx5_autoconf.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +/** + * DPDK callback to configure a VLAN filter. + * + * @param dev + * Pointer to Ethernet device structure. + * @param vlan_id + * VLAN ID to filter. + * @param on + * Toggle filter. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + DRV_LOG(DEBUG, "port %u %s VLAN filter ID %" PRIu16, + dev->data->port_id, (on ? "enable" : "disable"), vlan_id); + MLX5_ASSERT(priv->vlan_filter_n <= RTE_DIM(priv->vlan_filter)); + for (i = 0; (i != priv->vlan_filter_n); ++i) + if (priv->vlan_filter[i] == vlan_id) + break; + /* Check if there's room for another VLAN filter. 
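+	 * The filter table is kept dense: new IDs are appended at the tail
+	 * and removal shifts the remaining entries down (memmove below), so
+	 * the first vlan_filter_n slots always hold the active filters.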
*/ + if (i == RTE_DIM(priv->vlan_filter)) { + rte_errno = ENOMEM; + return -rte_errno; + } + if (i < priv->vlan_filter_n) { + MLX5_ASSERT(priv->vlan_filter_n != 0); + /* Enabling an existing VLAN filter has no effect. */ + if (on) + goto out; + /* Remove VLAN filter from list. */ + --priv->vlan_filter_n; + memmove(&priv->vlan_filter[i], + &priv->vlan_filter[i + 1], + sizeof(priv->vlan_filter[i]) * + (priv->vlan_filter_n - i)); + priv->vlan_filter[priv->vlan_filter_n] = 0; + } else { + MLX5_ASSERT(i == priv->vlan_filter_n); + /* Disabling an unknown VLAN filter has no effect. */ + if (!on) + goto out; + /* Add new VLAN filter. */ + priv->vlan_filter[priv->vlan_filter_n] = vlan_id; + ++priv->vlan_filter_n; + } +out: + if (dev->data->dev_started) + return mlx5_traffic_restart(dev); + return 0; +} + +/** + * Callback to set/reset VLAN stripping for a specific queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param queue + * RX queue index. + * @param on + * Enable/disable VLAN stripping. + */ +void +mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq = (*priv->rxqs)[queue]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + struct ibv_wq_attr mod; + uint16_t vlan_offloads = + (on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0) | + 0; + int ret = 0; + + /* Validate hw support */ + if (!priv->config.hw_vlan_strip) { + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); + return; + } + /* Validate queue number */ + if (queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u VLAN stripping, invalid queue number %d", + dev->data->port_id, queue); + return; + } + DRV_LOG(DEBUG, "port %u set VLAN offloads 0x%x for port %uqueue %d", + dev->data->port_id, vlan_offloads, rxq->port_id, queue); + if (!rxq_ctrl->obj) { + /* Update related bits in RX queue. */ + rxq->vlan_strip = !!on; + return; + } + if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { + mod = (struct ibv_wq_attr){ + .attr_mask = IBV_WQ_ATTR_FLAGS, + .flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING, + .flags = vlan_offloads, + }; + ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); + } else if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) { + struct mlx5_devx_modify_rq_attr rq_attr; + + memset(&rq_attr, 0, sizeof(rq_attr)); + rq_attr.rq_state = MLX5_RQC_STATE_RDY; + rq_attr.state = MLX5_RQC_STATE_RDY; + rq_attr.vsd = (on ? 0 : 1); + rq_attr.modify_bitmask = MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD; + ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr); + } + if (ret) { + DRV_LOG(ERR, "port %u failed to modify object %d stripping " + "mode: %s", dev->data->port_id, + rxq_ctrl->obj->type, strerror(rte_errno)); + return; + } + /* Update related bits in RX queue. */ + rxq->vlan_strip = !!on; +} + +/** + * Callback to set/reset VLAN offloads for a port. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mask + * VLAN offload bit mask. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
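+ *   Only ETH_VLAN_STRIP_MASK is handled here; if VLAN stripping is not
+ *   supported by the hardware an error is logged and 0 is returned.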
+ */ +int +mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + if (mask & ETH_VLAN_STRIP_MASK) { + int hw_vlan_strip = !!(dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_STRIP); + + if (!priv->config.hw_vlan_strip) { + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); + return 0; + } + /* Run on every RX queue and set/reset VLAN stripping. */ + for (i = 0; (i != priv->rxqs_n); i++) + mlx5_vlan_strip_queue_set(dev, i, hw_vlan_strip); + } + return 0; +} + +/* + * Release VLAN network device, created for VM workaround. + * + * @param[in] dev + * Ethernet device object, Netlink context provider. + * @param[in] vlan + * Object representing the network device to release. + */ +void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vlan) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_nl_vlan_vmwa_context *vmwa = priv->vmwa_context; + struct mlx5_nl_vlan_dev *vlan_dev = &vmwa->vlan_dev[0]; + + MLX5_ASSERT(vlan->created); + MLX5_ASSERT(priv->vmwa_context); + if (!vlan->created || !vmwa) + return; + vlan->created = 0; + MLX5_ASSERT(vlan_dev[vlan->tag].refcnt); + if (--vlan_dev[vlan->tag].refcnt == 0 && + vlan_dev[vlan->tag].ifindex) { + mlx5_nl_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex); + vlan_dev[vlan->tag].ifindex = 0; + } +} + +/** + * Acquire VLAN interface with specified tag for VM workaround. + * + * @param[in] dev + * Ethernet device object, Netlink context provider. + * @param[in] vlan + * Object representing the network device to acquire. + */ +void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vlan) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_nl_vlan_vmwa_context *vmwa = priv->vmwa_context; + struct mlx5_nl_vlan_dev *vlan_dev = &vmwa->vlan_dev[0]; + + MLX5_ASSERT(!vlan->created); + MLX5_ASSERT(priv->vmwa_context); + if (vlan->created || !vmwa) + return; + if (vlan_dev[vlan->tag].refcnt == 0) { + MLX5_ASSERT(!vlan_dev[vlan->tag].ifindex); + vlan_dev[vlan->tag].ifindex = + mlx5_nl_vlan_vmwa_create(vmwa, vmwa->vf_ifindex, + vlan->tag); + } + if (vlan_dev[vlan->tag].ifindex) { + vlan_dev[vlan->tag].refcnt++; + vlan->created = 1; + } +} + +/* + * Create per ethernet device VLAN VM workaround context + */ +struct mlx5_nl_vlan_vmwa_context * +mlx5_vlan_vmwa_init(struct rte_eth_dev *dev, uint32_t ifindex) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + struct mlx5_nl_vlan_vmwa_context *vmwa; + enum rte_hypervisor hv_type; + + /* Do not engage workaround over PF. */ + if (!config->vf) + return NULL; + /* Check whether there is desired virtual environment */ + hv_type = rte_hypervisor_get(); + switch (hv_type) { + case RTE_HYPERVISOR_UNKNOWN: + case RTE_HYPERVISOR_VMWARE: + /* + * The "white list" of configurations + * to engage the workaround. + */ + break; + default: + /* + * The configuration is not found in the "white list". + * We should not engage the VLAN workaround. 
+ */ + return NULL; + } + vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t)); + if (!vmwa) { + DRV_LOG(WARNING, + "Can not allocate memory" + " for VLAN workaround context"); + return NULL; + } + vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE); + if (vmwa->nl_socket < 0) { + DRV_LOG(WARNING, + "Can not create Netlink socket" + " for VLAN workaround context"); + rte_free(vmwa); + return NULL; + } + vmwa->vf_ifindex = ifindex; + /* Cleanup for existing VLAN devices. */ + return vmwa; +} + +/* + * Destroy per ethernet device VLAN VM workaround context + */ +void mlx5_vlan_vmwa_exit(struct mlx5_nl_vlan_vmwa_context *vmwa) +{ + unsigned int i; + + /* Delete all remaining VLAN devices. */ + for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) { + if (vmwa->vlan_dev[i].ifindex) + mlx5_nl_vlan_vmwa_delete(vmwa, + vmwa->vlan_dev[i].ifindex); + } + if (vmwa->nl_socket >= 0) + close(vmwa->nl_socket); + rte_free(vmwa); +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h new file mode 100644 index 000000000..8c6922835 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2020 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_PRIVATE_MLX5_H_ +#define RTE_PMD_PRIVATE_MLX5_H_ + +/** + * @file + * MLX5 public header. + * + * This interface provides the ability to support private PMD + * dynamic flags. + */ + +#define RTE_PMD_MLX5_FINE_GRANULARITY_INLINE "mlx5_fine_granularity_inline" + +/** + * Returns the dynamic flags name, that are supported. + * + * @param[out] names + * Array that is used to return the supported dynamic flags names. + * @param[in] n + * The number of elements in the names array. + * + * @return + * The number of dynamic flags that were copied if not negative. + * Otherwise: + * - ENOMEM - not enough entries in the array + * - EINVAL - invalid array entry + */ +__rte_experimental +int rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n); + +#endif diff --git a/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map new file mode 100644 index 000000000..c8b1031b0 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map @@ -0,0 +1,10 @@ +DPDK_20.0 { + local: *; +}; + +EXPERIMENTAL { + global: + + # added in 20.02 + rte_pmd_mlx5_get_dyn_flag_names; +}; |