diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
22 files changed, 23136 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig new file mode 100644 index 000000000..0440966bc --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Kconfig @@ -0,0 +1,9 @@ +config MLX5_INFINIBAND + tristate "Mellanox 5th generation network adapters (ConnectX series) support" + depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE + depends on INFINIBAND_USER_ACCESS || INFINIBAND_USER_ACCESS=n + ---help--- + This driver provides low-level InfiniBand support for + Mellanox Connect-IB PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile new file mode 100644 index 000000000..b8e4b15e2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o +mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o +mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o +mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o +mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += flow.o diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c new file mode 100644 index 000000000..ffd03bf1a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct mlx5_ib_ah *ah, + struct rdma_ah_attr *ah_attr) +{ + enum ib_gid_type gid_type; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + memcpy(ah->av.rgid, &grh->dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(grh->flow_label | + (1 << 30) | + grh->sgid_index << 20); + ah->av.hop_limit = grh->hop_limit; + ah->av.tclass = grh->traffic_class; + } + + ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + gid_type = ah_attr->grh.sgid_attr->gid_type; + + memcpy(ah->av.rmac, ah_attr->roce.dmac, + sizeof(ah_attr->roce.dmac)); + ah->av.udp_sport = + mlx5_get_roce_udp_sport(dev, ah_attr->grh.sgid_attr); + ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0x7) << 1; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) +#define MLX5_ECN_ENABLED BIT(1) + ah->av.tclass |= MLX5_ECN_ENABLED; + } else { + ah->av.rlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr)); + ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; + ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf); + } + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) + +{ + struct mlx5_ib_ah *ah; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + enum rdma_ah_attr_type ah_type = ah_attr->type; + + if ((ah_type == RDMA_AH_ATTR_TYPE_ROCE) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return ERR_PTR(-EINVAL); + + if (ah_type == RDMA_AH_ATTR_TYPE_ROCE && udata) { + int err; + struct mlx5_ib_create_ah_resp resp = {}; + u32 min_resp_len = offsetof(typeof(resp), dmac) + + sizeof(resp.dmac); + + if (udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + resp.response_length = min_resp_len; + + memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + return ERR_PTR(err); + } + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + return create_ib_ah(dev, ah, ah_attr); /* never fails */ +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = ibah->type; + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + rdma_ah_set_grh(ah_attr, NULL, + tmp & 0xfffff, + (tmp >> 20) & 0xff, + ah->av.hop_limit, + ah->av.tclass); + rdma_ah_set_dgid_raw(ah_attr, ah->av.rgid); + } + rdma_ah_set_dlid(ah_attr, be16_to_cpu(ah->av.rlid)); + rdma_ah_set_static_rate(ah_attr, ah->av.stat_rate_sl >> 4); + rdma_ah_set_sl(ah_attr, ah->av.stat_rate_sl & 0xf); + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c new file mode 100644 index 000000000..c84fef9a8 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "cmd.h" + +int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {0}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *mkey = MLX5_GET(query_special_contexts_out, out, + dump_fill_mkey); + return err; +} + +int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *null_mkey = MLX5_GET(query_special_contexts_out, out, + null_mkey); + return err; +} + +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = { }; + + MLX5_SET(query_cong_params_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_PARAMS); + MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} + +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, + void *in, int in_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { }; + + return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); +} + +int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, + u64 length, u32 alignment) +{ + struct mlx5_core_dev *dev = memic->dev; + u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size) + >> PAGE_SHIFT; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 max_alignment = MLX5_CAP_DEV_MEM(dev, log_max_memic_addr_alignment); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 out[MLX5_ST_SZ_DW(alloc_memic_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_memic_in)] = {}; + u32 mlx5_alignment; + u64 page_idx = 0; + int ret = 0; + + if (!length || (length & MLX5_MEMIC_ALLOC_SIZE_MASK)) + return -EINVAL; + + /* mlx5 device sets alignment as 64*2^driver_value + * so normalizing is needed. + */ + mlx5_alignment = (alignment < MLX5_MEMIC_BASE_ALIGN) ? 0 : + alignment - MLX5_MEMIC_BASE_ALIGN; + if (mlx5_alignment > max_alignment) + return -EINVAL; + + MLX5_SET(alloc_memic_in, in, opcode, MLX5_CMD_OP_ALLOC_MEMIC); + MLX5_SET(alloc_memic_in, in, range_size, num_pages * PAGE_SIZE); + MLX5_SET(alloc_memic_in, in, memic_size, length); + MLX5_SET(alloc_memic_in, in, log_memic_addr_alignment, + mlx5_alignment); + + while (page_idx < num_memic_hw_pages) { + spin_lock(&memic->memic_lock); + page_idx = bitmap_find_next_zero_area(memic->memic_alloc_pages, + num_memic_hw_pages, + page_idx, + num_pages, 0); + + if (page_idx < num_memic_hw_pages) + bitmap_set(memic->memic_alloc_pages, + page_idx, num_pages); + + spin_unlock(&memic->memic_lock); + + if (page_idx >= num_memic_hw_pages) + break; + + MLX5_SET64(alloc_memic_in, in, range_start_addr, + hw_start_addr + (page_idx * PAGE_SIZE)); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) { + spin_lock(&memic->memic_lock); + bitmap_clear(memic->memic_alloc_pages, + page_idx, num_pages); + spin_unlock(&memic->memic_lock); + + if (ret == -EAGAIN) { + page_idx++; + continue; + } + + return ret; + } + + *addr = pci_resource_start(dev->pdev, 0) + + MLX5_GET64(alloc_memic_out, out, memic_start_addr); + + return 0; + } + + return -ENOMEM; +} + +int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length) +{ + struct mlx5_core_dev *dev = memic->dev; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 out[MLX5_ST_SZ_DW(dealloc_memic_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {0}; + u64 start_page_idx; + int err; + + addr -= pci_resource_start(dev->pdev, 0); + start_page_idx = (addr - hw_start_addr) >> PAGE_SHIFT; + + MLX5_SET(dealloc_memic_in, in, opcode, MLX5_CMD_OP_DEALLOC_MEMIC); + MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr); + MLX5_SET(dealloc_memic_in, in, memic_size, length); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + + if (!err) { + spin_lock(&memic->memic_lock); + bitmap_clear(memic->memic_alloc_pages, + start_page_idx, num_pages); + spin_unlock(&memic->memic_lock); + } + + return err; +} + +int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out) +{ + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + MLX5_SET(ppcnt_reg, in, local_port, 1); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); + return mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT, + 0, 0); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h new file mode 100644 index 000000000..88cbb1c41 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_CMD_H +#define MLX5_IB_CMD_H + +#include "mlx5_ib.h" +#include <linux/kernel.h> +#include <linux/mlx5/driver.h> + +int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey); +int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size); +int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out); +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, + void *in, int in_size); +int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, + u64 length, u32 alignment); +int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); +#endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c new file mode 100644 index 000000000..7e4e358a4 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/debugfs.h> + +#include "mlx5_ib.h" +#include "cmd.h" + +enum mlx5_ib_cong_node_type { + MLX5_IB_RROCE_ECN_RP = 1, + MLX5_IB_RROCE_ECN_NP = 2, +}; + +static const char * const mlx5_ib_dbg_cc_name[] = { + "rp_clamp_tgt_rate", + "rp_clamp_tgt_rate_ati", + "rp_time_reset", + "rp_byte_reset", + "rp_threshold", + "rp_ai_rate", + "rp_hai_rate", + "rp_min_dec_fac", + "rp_min_rate", + "rp_rate_to_set_on_first_cnp", + "rp_dce_tcp_g", + "rp_dce_tcp_rtt", + "rp_rate_reduce_monitor_period", + "rp_initial_alpha_value", + "rp_gd", + "np_cnp_dscp", + "np_cnp_prio_mode", + "np_cnp_prio", +}; + +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR BIT(2) +#define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) +#define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) +#define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) +#define MLX5_IB_RP_AI_RATE_ATTR BIT(7) +#define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) +#define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) +#define MLX5_IB_RP_MIN_RATE_ATTR BIT(10) +#define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR BIT(11) +#define MLX5_IB_RP_DCE_TCP_G_ATTR BIT(12) +#define MLX5_IB_RP_DCE_TCP_RTT_ATTR BIT(13) +#define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR BIT(14) +#define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) +#define MLX5_IB_RP_GD_ATTR BIT(16) + +#define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) +#define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) + +static enum mlx5_ib_cong_node_type +mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset) +{ + if (param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE && + param_offset <= MLX5_IB_DBG_CC_RP_GD) + return MLX5_IB_RROCE_ECN_RP; + else + return MLX5_IB_RROCE_ECN_NP; +} + +static u32 mlx5_get_cc_param_val(void *field, int offset) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate); + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc); + case MLX5_IB_DBG_CC_RP_TIME_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset); + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset); + case MLX5_IB_DBG_CC_RP_THRESHOLD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_threshold); + case MLX5_IB_DBG_CC_RP_AI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate); + case MLX5_IB_DBG_CC_RP_HAI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate); + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac); + case MLX5_IB_DBG_CC_RP_MIN_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate); + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp); + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g); + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt); + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period); + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value); + case MLX5_IB_DBG_CC_RP_GD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_gd); + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_dscp); + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_prio_mode); + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_802p_prio); + default: + return 0; + } +} + +static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, + u32 var, u32 *attr_mask) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate, var); + break; + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc, var); + break; + case MLX5_IB_DBG_CC_RP_TIME_RESET: + *attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset, var); + break; + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + *attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset, var); + break; + case MLX5_IB_DBG_CC_RP_THRESHOLD: + *attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_threshold, var); + break; + case MLX5_IB_DBG_CC_RP_AI_RATE: + *attr_mask |= MLX5_IB_RP_AI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_HAI_RATE: + *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + *attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_RATE: + *attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + *attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + *attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + *attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + *attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period, var); + break; + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + *attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value, var); + break; + case MLX5_IB_DBG_CC_RP_GD: + *attr_mask |= MLX5_IB_RP_GD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_gd, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0); + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); + break; + } +} + +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 *var) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); + void *out; + void *field; + int err; + enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) { + err = -ENOMEM; + goto alloc_err; + } + + node = mlx5_ib_param_to_node(offset); + + err = mlx5_cmd_query_cong_params(mdev, node, out, outlen); + if (err) + goto free; + + field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters); + *var = mlx5_get_cc_param_val(field, offset); + +free: + kvfree(out); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + return err; +} + +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 var) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); + void *in; + void *field; + enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + u32 attr_mask = 0; + int err; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto alloc_err; + } + + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + + node = mlx5_ib_param_to_node(offset); + MLX5_SET(modify_cong_params_in, in, cong_protocol, node); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters); + mlx5_ib_set_cc_param_mask_val(field, offset, var, &attr_mask); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, + attr_mask); + + err = mlx5_cmd_modify_cong_params(mdev, in, inlen); + kvfree(in); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + return err; +} + +static ssize_t set_param(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + char lbuf[11] = { }; + u32 var; + int ret; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &var)) + return -EINVAL; + + ret = mlx5_ib_set_cc_params(param->dev, param->port_num, offset, var); + return ret ? ret : count; +} + +static ssize_t get_param(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + u32 var = 0; + int ret; + char lbuf[11]; + + ret = mlx5_ib_get_cc_params(param->dev, param->port_num, offset, &var); + if (ret) + return ret; + + ret = snprintf(lbuf, sizeof(lbuf), "%d\n", var); + if (ret < 0) + return ret; + + return simple_read_from_buffer(buf, count, pos, lbuf, ret); +} + +static const struct file_operations dbg_cc_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = set_param, + .read = get_param, +}; + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) +{ + if (!mlx5_debugfs_root || + !dev->port[port_num].dbg_cc_params || + !dev->port[port_num].dbg_cc_params->root) + return; + + debugfs_remove_recursive(dev->port[port_num].dbg_cc_params->root); + kfree(dev->port[port_num].dbg_cc_params); + dev->port[port_num].dbg_cc_params = NULL; +} + +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) +{ + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + struct mlx5_core_dev *mdev; + int i; + + if (!mlx5_debugfs_root) + goto out; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + goto out; + + if (!MLX5_CAP_GEN(mdev, cc_query_allowed) || + !MLX5_CAP_GEN(mdev, cc_modify_allowed)) + goto put_mdev; + + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); + if (!dbg_cc_params) + goto err; + + dev->port[port_num].dbg_cc_params = dbg_cc_params; + + dbg_cc_params->root = debugfs_create_dir("cc_params", + mdev->priv.dbg_root); + if (!dbg_cc_params->root) + goto err; + + for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { + dbg_cc_params->params[i].offset = i; + dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].port_num = port_num; + dbg_cc_params->params[i].dentry = + debugfs_create_file(mlx5_ib_dbg_cc_name[i], + 0600, dbg_cc_params->root, + &dbg_cc_params->params[i], + &dbg_cc_fops); + if (!dbg_cc_params->params[i].dentry) + goto err; + } + +put_mdev: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); +out: + return 0; + +err: + mlx5_ib_warn(dev, "cong debugfs failure\n"); + mlx5_ib_cleanup_cong_debugfs(dev, port_num); + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. + */ + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c new file mode 100644 index 000000000..872985e4e --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -0,0 +1,1494 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kref.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_cache.h> +#include "mlx5_ib.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_REG_MR: + return IB_WC_REG_MR; + + default: + pr_warn("unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1); + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 roce_packet_type; + bool vlan_present; + u8 g; + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_IP_CSUM_OK; + if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + if (unlikely(is_qp1(qp->ibqp.qp_type))) { + u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; + + ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey, + &wc->pkey_index); + } else { + wc->pkey_index = 0; + } + + if (ll != IB_LINK_LAYER_ETHERNET) { + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + return; + } + + wc->slid = 0; + vlan_present = cqe->l4_l3_hdr_type & 0x1; + roce_packet_type = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0x3; + if (vlan_present) { + wc->vlan_id = (be16_to_cpu(cqe->vlan_info)) & 0xfff; + wc->sl = (be16_to_cpu(cqe->vlan_info) >> 13) & 0x7; + wc->wc_flags |= IB_WC_WITH_VLAN; + } else { + wc->sl = 0; + } + + switch (roce_packet_type) { + case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_IB; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + mlx5_ib_warn(dev, "dump error cqe\n"); + mlx5_dump_err_cqe(dev->mdev, cqe); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + uint16_t idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr)); + } else { + for (i = 0; i < byte_count; i += 8) { + *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.w_list[idx].next; + } while (1); + tail = qp->sq.w_list[idx].next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_frag_buf_free(dev->mdev, &buf->fbc.frag_buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned int cur; + unsigned int idx; + int np; + int i; + + wq = &qp->sq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + idx = wq->last_poll & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + wq->last_poll = wq->w_list[idx].next; + } + *npolled = np; +} + +static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned int cur; + int np; + int i; + + wq = &qp->rq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + } + *npolled = np; +} + +static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and return mmics ones */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + sw_send_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } + + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + sw_recv_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } +} + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp *mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mkey *mmkey; + struct mlx5_ib_mr *mr; + uint8_t opcode; + uint32_t qpn; + u16 wqe_ctr; + void *cqe; + int idx; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx5_qp_lookup(dev->mdev, qpn); + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? + "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + read_lock(&dev->mdev->priv.mkey_table.lock); + mmkey = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + mr = to_mibmr(mmkey); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + read_unlock(&dev->mdev->priv.mkey_table.lock); + goto repoll; + } + + return 0; +} + +static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc, bool is_fatal_err) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_ib_wc *soft_wc, *next; + int npolled = 0; + + list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) { + if (npolled >= num_entries) + break; + + mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", + cq->mcq.cqn); + + if (unlikely(is_fatal_err)) { + soft_wc->wc.status = IB_WC_WR_FLUSH_ERR; + soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + } + wc[npolled++] = soft_wc->wc; + list_del(&soft_wc->list); + kfree(soft_wc); + } + + return npolled; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int soft_polled = 0; + int npolled; + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + /* make sure no soft wqe's are waiting */ + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc, true); + + mlx5_ib_poll_sw_comp(cq, num_entries - soft_polled, + wc + soft_polled, &npolled); + goto out; + } + + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc, false); + + for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { + if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled)) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); +out: + spin_unlock_irqrestore(&cq->lock, flags); + + return soft_polled + npolled; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + void __iomem *uar_page = mdev->priv.uar->map; + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, irq_flags); + if (cq->notify_flags != IB_CQ_NEXT_COMP) + cq->notify_flags = flags & IB_CQ_SOLICITED_MASK; + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list)) + ret = 1; + spin_unlock_irqrestore(&cq->lock, irq_flags); + + mlx5_cq_arm(&cq->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + uar_page, to_mcq(ibcq)->mcq.cons_index); + + return ret; +} + +static int alloc_cq_frag_buf(struct mlx5_ib_dev *dev, + struct mlx5_ib_cq_buf *buf, + int nent, + int cqe_size) +{ + struct mlx5_frag_buf_ctrl *c = &buf->fbc; + struct mlx5_frag_buf *frag_buf = &c->frag_buf; + u32 cqc_buff[MLX5_ST_SZ_DW(cqc)] = {0}; + int err; + + MLX5_SET(cqc, cqc_buff, log_cq_size, ilog2(cqe_size)); + MLX5_SET(cqc, cqc_buff, cqe_sz, (cqe_size == 128) ? 1 : 0); + + mlx5_core_init_cq_frag_buf(&buf->fbc, cqc_buff); + + err = mlx5_frag_buf_alloc_node(dev->mdev, + nent * cqe_size, + frag_buf, + dev->mdev->priv.numa_node); + if (err) + return err; + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +enum { + MLX5_CQE_RES_FORMAT_HASH = 0, + MLX5_CQE_RES_FORMAT_CSUM = 1, + MLX5_CQE_RES_FORMAT_CSUM_STRIDX = 3, +}; + +static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format) +{ + switch (format) { + case MLX5_IB_CQE_RES_FORMAT_HASH: + return MLX5_CQE_RES_FORMAT_HASH; + case MLX5_IB_CQE_RES_FORMAT_CSUM: + return MLX5_CQE_RES_FORMAT_CSUM; + case MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX: + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + return MLX5_CQE_RES_FORMAT_CSUM_STRIDX; + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, u32 **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_ib_create_cq ucmd = {}; + size_t ucmdlen; + int page_shift; + __be64 *pas; + int npages; + int ncont; + void *cqc; + int err; + + ucmdlen = udata->inlen < sizeof(ucmd) ? + (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) + return -EFAULT; + + if (ucmdlen == sizeof(ucmd) && + (ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD))) + return -EINVAL; + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) + return -EINVAL; + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_umem; + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, 0, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); + + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont; + *cqb = kvzalloc(*inlen, GFP_KERNEL); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, pas, 0); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + *index = to_mucontext(context)->bfregi.sys_pages[0]; + + if (ucmd.cqe_comp_en == 1) { + int mini_cqe_format; + + if (!((*cqe_size == 128 && + MLX5_CAP_GEN(dev->mdev, cqe_compression_128)) || + (*cqe_size == 64 && + MLX5_CAP_GEN(dev->mdev, cqe_compression)))) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, "CQE compression is not supported for size %d!\n", + *cqe_size); + goto err_cqb; + } + + mini_cqe_format = + mini_cqe_res_format_to_hw(dev, + ucmd.cqe_comp_res_format); + if (mini_cqe_format < 0) { + err = mini_cqe_format; + mlx5_ib_dbg(dev, "CQE compression res format %d error: %d\n", + ucmd.cqe_comp_res_format, err); + goto err_cqb; + } + + MLX5_SET(cqc, cqc, cqe_comp_en, 1); + MLX5_SET(cqc, cqc, mini_cqe_res_format, mini_cqe_format); + } + + if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD) { + if (*cqe_size != 128 || + !MLX5_CAP_GEN(dev->mdev, cqe_128_always)) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, + "CQE padding is not supported for CQE size of %dB!\n", + *cqe_size); + goto err_cqb; + } + + cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD; + } + + return 0; + +err_cqb: + kvfree(*cqb); + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_frag_buf(struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + u32 **cqb, int *index, int *inlen) +{ + __be64 *pas; + void *cqc; + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_frag_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_frag_buf(&cq->buf); + + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * + cq->buf.fbc.frag_buf.npages; + *cqb = kvzalloc(*inlen, GFP_KERNEL); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_fill_page_frag_array(&cq->buf.fbc.frag_buf, pas); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + cq->buf.fbc.frag_buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + + *index = dev->mdev->priv.uar->index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +static void notify_soft_wc_handler(struct work_struct *work) +{ + struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq, + notify_work); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + int vector = attr->comp_vector; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + u32 *cqb = NULL; + void *cqc; + int cqe_size; + unsigned int irqn; + int eqn; + int err; + + if (entries < 0 || + (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) + return ERR_PTR(-EINVAL); + + if (check_cq_create_flags(attr->flags)) + return ERR_PTR(-EOPNOTSUPP); + + entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + cq->create_flags = attr->flags; + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + cqe_size = cache_line_size() == 128 ? 128 : 64; + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + + INIT_WORK(&cq->notify_work, notify_soft_wc_handler); + } + + err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cq->cqe_size = cqe_size; + + cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + MLX5_SET(cqc, cqc, uar_page, index); + MLX5_SET(cqc, cqc, c_eqn, eqn); + MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); + if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) + MLX5_SET(cqc, cqc, oi, 1); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + if (context) + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + else + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + INIT_LIST_HEAD(&cq->wc_list); + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + kvfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + kvfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + + if (!MLX5_CAP_GEN(dev->mdev, cq_moderation)) + return -EOPNOTSUPP; + + if (cq_period > MLX5_MAX_CQ_PERIOD) + return -EINVAL; + + err = mlx5_core_modify_cq_moderation(dev->mdev, &mcq->mcq, + cq_period, cq_count); + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + /* check multiplication overflow */ + if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, + (size_t)ucmd.cqe_size * entries, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, 0, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_frag_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_frag_buf(cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = mlx5_frag_buf_get_wqe(&cq->resize_buf->fbc, + (i + 1) & cq->resize_buf->nent); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + void *cqc; + u32 *in; + int err; + int npas; + __be64 *pas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1 || + entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d, max %d\n", + entries, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + return -EINVAL; + } + + entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + struct mlx5_frag_buf_ctrl *c; + + c = &cq->resize_buf->fbc; + npas = c->frag_buf.npages; + page_shift = c->frag_buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = MLX5_ST_SZ_BYTES(modify_cq_in) + + MLX5_FLD_SZ_BYTES(modify_cq_in, pas[0]) * npas; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas); + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + pas, 0); + else + mlx5_fill_page_frag_array(&cq->resize_buf->fbc.frag_buf, + pas); + + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.resize_field_select.resize_field_select, + MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + + MLX5_SET(modify_cq_in, in, op_mod, MLX5_CQ_OPMOD_RESIZE); + MLX5_SET(modify_cq_in, in, cqn, cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + kvfree(in); + return 0; + +ex_alloc: + kvfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} + +/* Called from atomic context */ +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) +{ + struct mlx5_ib_wc *soft_wc; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + unsigned long flags; + + soft_wc = kmalloc(sizeof(*soft_wc), GFP_ATOMIC); + if (!soft_wc) + return -ENOMEM; + + soft_wc->wc = *wc; + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&soft_wc->list, &cq->wc_list); + if (cq->notify_flags == IB_CQ_NEXT_COMP || + wc->status != IB_WC_SUCCESS) { + cq->notify_flags = 0; + schedule_work(&cq->notify_work); + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c new file mode 100644 index 000000000..c89aec834 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -0,0 +1,1127 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/ib_umem.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/fs.h> +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + +#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) +struct devx_obj { + struct mlx5_core_dev *mdev; + u32 obj_id; + u32 dinlen; /* destroy inbox length */ + u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; +}; + +struct devx_umem { + struct mlx5_core_dev *mdev; + struct ib_umem *umem; + u32 page_offset; + int page_shift; + int ncont; + u32 dinlen; + u32 dinbox[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)]; +}; + +struct devx_umem_reg_cmd { + void *in; + u32 inlen; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; +}; + +static struct mlx5_ib_ucontext *devx_ufile2uctx(struct ib_uverbs_file *file) +{ + return to_mucontext(ib_uverbs_get_ucontext(file)); +} + +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; + u64 general_obj_types; + void *hdr; + int err; + + hdr = MLX5_ADDR_OF(create_uctx_in, in, hdr); + + general_obj_types = MLX5_CAP_GEN_64(dev->mdev, general_obj_types); + if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UCTX) || + !(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UMEM)) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_UCTX); + + err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + context->devx_uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + return 0; +} + +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {0}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_UCTX); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, context->devx_uid); + + mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type) +{ + struct devx_obj *devx_obj = obj; + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); + + switch (opcode) { + case MLX5_CMD_OP_DESTROY_TIR: + *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + *dest_id = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, + obj_id); + return true; + + case MLX5_CMD_OP_DESTROY_FLOW_TABLE: + *dest_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + *dest_id = MLX5_GET(destroy_flow_table_in, devx_obj->dinbox, + table_id); + return true; + default: + return false; + } +} + +static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + u32 obj_id; + + switch (opcode) { + case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: + case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: + obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + break; + case MLX5_CMD_OP_QUERY_MKEY: + obj_id = MLX5_GET(query_mkey_in, in, mkey_index); + break; + case MLX5_CMD_OP_QUERY_CQ: + obj_id = MLX5_GET(query_cq_in, in, cqn); + break; + case MLX5_CMD_OP_MODIFY_CQ: + obj_id = MLX5_GET(modify_cq_in, in, cqn); + break; + case MLX5_CMD_OP_QUERY_SQ: + obj_id = MLX5_GET(query_sq_in, in, sqn); + break; + case MLX5_CMD_OP_MODIFY_SQ: + obj_id = MLX5_GET(modify_sq_in, in, sqn); + break; + case MLX5_CMD_OP_QUERY_RQ: + obj_id = MLX5_GET(query_rq_in, in, rqn); + break; + case MLX5_CMD_OP_MODIFY_RQ: + obj_id = MLX5_GET(modify_rq_in, in, rqn); + break; + case MLX5_CMD_OP_QUERY_RMP: + obj_id = MLX5_GET(query_rmp_in, in, rmpn); + break; + case MLX5_CMD_OP_MODIFY_RMP: + obj_id = MLX5_GET(modify_rmp_in, in, rmpn); + break; + case MLX5_CMD_OP_QUERY_RQT: + obj_id = MLX5_GET(query_rqt_in, in, rqtn); + break; + case MLX5_CMD_OP_MODIFY_RQT: + obj_id = MLX5_GET(modify_rqt_in, in, rqtn); + break; + case MLX5_CMD_OP_QUERY_TIR: + obj_id = MLX5_GET(query_tir_in, in, tirn); + break; + case MLX5_CMD_OP_MODIFY_TIR: + obj_id = MLX5_GET(modify_tir_in, in, tirn); + break; + case MLX5_CMD_OP_QUERY_TIS: + obj_id = MLX5_GET(query_tis_in, in, tisn); + break; + case MLX5_CMD_OP_MODIFY_TIS: + obj_id = MLX5_GET(modify_tis_in, in, tisn); + break; + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + obj_id = MLX5_GET(query_flow_table_in, in, table_id); + break; + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + obj_id = MLX5_GET(modify_flow_table_in, in, table_id); + break; + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + obj_id = MLX5_GET(query_flow_group_in, in, group_id); + break; + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + obj_id = MLX5_GET(query_fte_in, in, flow_index); + break; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + obj_id = MLX5_GET(set_fte_in, in, flow_index); + break; + case MLX5_CMD_OP_QUERY_Q_COUNTER: + obj_id = MLX5_GET(query_q_counter_in, in, counter_set_id); + break; + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + obj_id = MLX5_GET(query_flow_counter_in, in, flow_counter_id); + break; + case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: + obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + break; + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + obj_id = MLX5_GET(query_scheduling_element_in, in, + scheduling_element_id); + break; + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + obj_id = MLX5_GET(modify_scheduling_element_in, in, + scheduling_element_id); + break; + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + obj_id = MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); + break; + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + obj_id = MLX5_GET(query_l2_table_entry_in, in, table_index); + break; + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + obj_id = MLX5_GET(set_l2_table_entry_in, in, table_index); + break; + case MLX5_CMD_OP_QUERY_QP: + obj_id = MLX5_GET(query_qp_in, in, qpn); + break; + case MLX5_CMD_OP_RST2INIT_QP: + obj_id = MLX5_GET(rst2init_qp_in, in, qpn); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + obj_id = MLX5_GET(init2rtr_qp_in, in, qpn); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + obj_id = MLX5_GET(rtr2rts_qp_in, in, qpn); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + obj_id = MLX5_GET(rts2rts_qp_in, in, qpn); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + obj_id = MLX5_GET(sqerr2rts_qp_in, in, qpn); + break; + case MLX5_CMD_OP_2ERR_QP: + obj_id = MLX5_GET(qp_2err_in, in, qpn); + break; + case MLX5_CMD_OP_2RST_QP: + obj_id = MLX5_GET(qp_2rst_in, in, qpn); + break; + case MLX5_CMD_OP_QUERY_DCT: + obj_id = MLX5_GET(query_dct_in, in, dctn); + break; + case MLX5_CMD_OP_QUERY_XRQ: + obj_id = MLX5_GET(query_xrq_in, in, xrqn); + break; + case MLX5_CMD_OP_QUERY_XRC_SRQ: + obj_id = MLX5_GET(query_xrc_srq_in, in, xrc_srqn); + break; + case MLX5_CMD_OP_ARM_XRC_SRQ: + obj_id = MLX5_GET(arm_xrc_srq_in, in, xrc_srqn); + break; + case MLX5_CMD_OP_QUERY_SRQ: + obj_id = MLX5_GET(query_srq_in, in, srqn); + break; + case MLX5_CMD_OP_ARM_RQ: + obj_id = MLX5_GET(arm_rq_in, in, srq_number); + break; + case MLX5_CMD_OP_DRAIN_DCT: + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + obj_id = MLX5_GET(drain_dct_in, in, dctn); + break; + case MLX5_CMD_OP_ARM_XRQ: + obj_id = MLX5_GET(arm_xrq_in, in, xrqn); + break; + default: + return false; + } + + if (obj_id == obj->obj_id) + return true; + + return false; +} + +static bool devx_is_obj_create_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + case MLX5_CMD_OP_CREATE_MKEY: + case MLX5_CMD_OP_CREATE_CQ: + case MLX5_CMD_OP_ALLOC_PD: + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + case MLX5_CMD_OP_CREATE_RMP: + case MLX5_CMD_OP_CREATE_SQ: + case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_CREATE_RQT: + case MLX5_CMD_OP_CREATE_TIR: + case MLX5_CMD_OP_CREATE_TIS: + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + case MLX5_CMD_OP_ALLOC_ENCAP_HEADER: + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_CREATE_QP: + case MLX5_CMD_OP_CREATE_SRQ: + case MLX5_CMD_OP_CREATE_XRC_SRQ: + case MLX5_CMD_OP_CREATE_DCT: + case MLX5_CMD_OP_CREATE_XRQ: + case MLX5_CMD_OP_ATTACH_TO_MCG: + case MLX5_CMD_OP_ALLOC_XRCD: + return true; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + { + u16 op_mod = MLX5_GET(set_fte_in, in, op_mod); + if (op_mod == 0) + return true; + return false; + } + default: + return false; + } +} + +static bool devx_is_obj_modify_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: + case MLX5_CMD_OP_MODIFY_CQ: + case MLX5_CMD_OP_MODIFY_RMP: + case MLX5_CMD_OP_MODIFY_SQ: + case MLX5_CMD_OP_MODIFY_RQ: + case MLX5_CMD_OP_MODIFY_RQT: + case MLX5_CMD_OP_MODIFY_TIR: + case MLX5_CMD_OP_MODIFY_TIS: + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_RST2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + case MLX5_CMD_OP_SQERR2RTS_QP: + case MLX5_CMD_OP_2ERR_QP: + case MLX5_CMD_OP_2RST_QP: + case MLX5_CMD_OP_ARM_XRC_SRQ: + case MLX5_CMD_OP_ARM_RQ: + case MLX5_CMD_OP_DRAIN_DCT: + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + case MLX5_CMD_OP_ARM_XRQ: + return true; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + { + u16 op_mod = MLX5_GET(set_fte_in, in, op_mod); + + if (op_mod == 1) + return true; + return false; + } + default: + return false; + } +} + +static bool devx_is_obj_query_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: + case MLX5_CMD_OP_QUERY_MKEY: + case MLX5_CMD_OP_QUERY_CQ: + case MLX5_CMD_OP_QUERY_RMP: + case MLX5_CMD_OP_QUERY_SQ: + case MLX5_CMD_OP_QUERY_RQ: + case MLX5_CMD_OP_QUERY_RQT: + case MLX5_CMD_OP_QUERY_TIR: + case MLX5_CMD_OP_QUERY_TIS: + case MLX5_CMD_OP_QUERY_Q_COUNTER: + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_QP: + case MLX5_CMD_OP_QUERY_SRQ: + case MLX5_CMD_OP_QUERY_XRC_SRQ: + case MLX5_CMD_OP_QUERY_DCT: + case MLX5_CMD_OP_QUERY_XRQ: + return true; + default: + return false; + } +} + +static bool devx_is_general_cmd(void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_VPORT_STATE: + case MLX5_CMD_OP_QUERY_ADAPTER: + case MLX5_CMD_OP_QUERY_ISSI: + case MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ROCE_ADDRESS: + case MLX5_CMD_OP_QUERY_VNIC_ENV: + case MLX5_CMD_OP_QUERY_VPORT_COUNTER: + case MLX5_CMD_OP_GET_DROPPED_PACKET_LOG: + case MLX5_CMD_OP_NOP: + case MLX5_CMD_OP_QUERY_CONG_STATUS: + case MLX5_CMD_OP_QUERY_CONG_PARAMS: + case MLX5_CMD_OP_QUERY_CONG_STATISTICS: + return true; + default: + return false; + } +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int user_vector; + int dev_eqn; + unsigned int irqn; + int err; + + if (uverbs_copy_from(&user_vector, attrs, + MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC)) + return -EFAULT; + + c = devx_ufile2uctx(file); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn, &irqn); + if (err < 0) + return err; + + if (uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, + &dev_eqn, sizeof(dev_eqn))) + return -EFAULT; + + return 0; +} + +/* + *Security note: + * The hardware protection mechanism works like this: Each device object that + * is subject to UAR doorbells (QP/SQ/CQ) gets a UAR ID (called uar_page in + * the device specification manual) upon its creation. Then upon doorbell, + * hardware fetches the object context for which the doorbell was rang, and + * validates that the UAR through which the DB was rang matches the UAR ID + * of the object. + * If no match the doorbell is silently ignored by the hardware. Of course, + * the user cannot ring a doorbell on a UAR that was not mapped to it. + * Now in devx, as the devx kernel does not manipulate the QP/SQ/CQ command + * mailboxes (except tagging them with UID), we expose to the user its UAR + * ID, so it can embed it in these objects in the expected specification + * format. So the only thing the user can do is hurt itself by creating a + * QP/SQ/CQ with a UAR ID other than his, and then in this case other users + * may ring a doorbell on its objects. + * The consequence of that will be that another user can schedule a QP/SQ + * of the buggy user for execution (just insert it to the hardware schedule + * queue or arm its CQ for event generation), no further harm is expected. + */ +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_UAR)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + u32 user_idx; + s32 dev_idx; + + c = devx_ufile2uctx(file); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + if (uverbs_copy_from(&user_idx, attrs, + MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX)) + return -EFAULT; + + dev_idx = bfregn_to_uar_index(dev, &c->bfregi, user_idx, true); + if (dev_idx < 0) + return dev_idx; + + if (uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, + &dev_idx, sizeof(dev_idx))) + return -EFAULT; + + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + void *cmd_in = uverbs_attr_get_alloced_ptr( + attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT); + void *cmd_out; + int err; + + c = devx_ufile2uctx(file); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + if (!c->devx_uid) + return -EPERM; + + /* Only white list of some general HCA commands are allowed for this method. */ + if (!devx_is_general_cmd(cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(dev->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN), + cmd_out, cmd_out_len); + if (err) + return err; + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, cmd_out, + cmd_out_len); +} + +static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, + u32 *dinlen, + u32 *obj_id) +{ + u16 obj_type = MLX5_GET(general_obj_in_cmd_hdr, in, obj_type); + u16 uid = MLX5_GET(general_obj_in_cmd_hdr, in, uid); + + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + *dinlen = MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr); + + MLX5_SET(general_obj_in_cmd_hdr, din, obj_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, uid, uid); + + switch (MLX5_GET(general_obj_in_cmd_hdr, in, opcode)) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, din, obj_type, obj_type); + break; + + case MLX5_CMD_OP_CREATE_MKEY: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_MKEY); + break; + case MLX5_CMD_OP_CREATE_CQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_CQ); + break; + case MLX5_CMD_OP_ALLOC_PD: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DEALLOC_PD); + break; + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + break; + case MLX5_CMD_OP_CREATE_RMP: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_RMP); + break; + case MLX5_CMD_OP_CREATE_SQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_SQ); + break; + case MLX5_CMD_OP_CREATE_RQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_RQ); + break; + case MLX5_CMD_OP_CREATE_RQT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_RQT); + break; + case MLX5_CMD_OP_CREATE_TIR: + *obj_id = MLX5_GET(create_tir_out, out, tirn); + MLX5_SET(destroy_tir_in, din, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, din, tirn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_TIS: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_TIS); + break; + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + break; + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + *dinlen = MLX5_ST_SZ_BYTES(destroy_flow_table_in); + *obj_id = MLX5_GET(create_flow_table_out, out, table_id); + MLX5_SET(destroy_flow_table_in, din, other_vport, + MLX5_GET(create_flow_table_in, in, other_vport)); + MLX5_SET(destroy_flow_table_in, din, vport_number, + MLX5_GET(create_flow_table_in, in, vport_number)); + MLX5_SET(destroy_flow_table_in, din, table_type, + MLX5_GET(create_flow_table_in, in, table_type)); + MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_FLOW_TABLE); + break; + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + *dinlen = MLX5_ST_SZ_BYTES(destroy_flow_group_in); + *obj_id = MLX5_GET(create_flow_group_out, out, group_id); + MLX5_SET(destroy_flow_group_in, din, other_vport, + MLX5_GET(create_flow_group_in, in, other_vport)); + MLX5_SET(destroy_flow_group_in, din, vport_number, + MLX5_GET(create_flow_group_in, in, vport_number)); + MLX5_SET(destroy_flow_group_in, din, table_type, + MLX5_GET(create_flow_group_in, in, table_type)); + MLX5_SET(destroy_flow_group_in, din, table_id, + MLX5_GET(create_flow_group_in, in, table_id)); + MLX5_SET(destroy_flow_group_in, din, group_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); + break; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + *dinlen = MLX5_ST_SZ_BYTES(delete_fte_in); + *obj_id = MLX5_GET(set_fte_in, in, flow_index); + MLX5_SET(delete_fte_in, din, other_vport, + MLX5_GET(set_fte_in, in, other_vport)); + MLX5_SET(delete_fte_in, din, vport_number, + MLX5_GET(set_fte_in, in, vport_number)); + MLX5_SET(delete_fte_in, din, table_type, + MLX5_GET(set_fte_in, in, table_type)); + MLX5_SET(delete_fte_in, din, table_id, + MLX5_GET(set_fte_in, in, table_id)); + MLX5_SET(delete_fte_in, din, flow_index, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); + break; + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); + break; + case MLX5_CMD_OP_ALLOC_ENCAP_HEADER: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_ENCAP_HEADER); + break; + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); + break; + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + *dinlen = MLX5_ST_SZ_BYTES(destroy_scheduling_element_in); + *obj_id = MLX5_GET(create_scheduling_element_out, out, + scheduling_element_id); + MLX5_SET(destroy_scheduling_element_in, din, + scheduling_hierarchy, + MLX5_GET(create_scheduling_element_in, in, + scheduling_hierarchy)); + MLX5_SET(destroy_scheduling_element_in, din, + scheduling_element_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT); + break; + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + *dinlen = MLX5_ST_SZ_BYTES(delete_vxlan_udp_dport_in); + *obj_id = MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); + MLX5_SET(delete_vxlan_udp_dport_in, din, vxlan_udp_port, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); + break; + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + *dinlen = MLX5_ST_SZ_BYTES(delete_l2_table_entry_in); + *obj_id = MLX5_GET(set_l2_table_entry_in, in, table_index); + MLX5_SET(delete_l2_table_entry_in, din, table_index, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); + break; + case MLX5_CMD_OP_CREATE_QP: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_QP); + break; + case MLX5_CMD_OP_CREATE_SRQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_SRQ); + break; + case MLX5_CMD_OP_CREATE_XRC_SRQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_XRC_SRQ); + break; + case MLX5_CMD_OP_CREATE_DCT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_DCT); + break; + case MLX5_CMD_OP_CREATE_XRQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_XRQ); + break; + case MLX5_CMD_OP_ATTACH_TO_MCG: + *dinlen = MLX5_ST_SZ_BYTES(detach_from_mcg_in); + MLX5_SET(detach_from_mcg_in, din, qpn, + MLX5_GET(attach_to_mcg_in, in, qpn)); + memcpy(MLX5_ADDR_OF(detach_from_mcg_in, din, multicast_gid), + MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid), + MLX5_FLD_SZ_BYTES(attach_to_mcg_in, multicast_gid)); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + break; + case MLX5_CMD_OP_ALLOC_XRCD: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + break; + default: + /* The entry must match to one of the devx_is_obj_create_cmd */ + WARN_ON(true); + break; + } +} + +static int devx_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct devx_obj *obj = uobject->object; + int ret; + + ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + kfree(obj); + return ret; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT); + void *cmd_out; + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE); + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct devx_obj *obj; + int err; + + if (!c->devx_uid) + return -EPERM; + + if (!devx_is_obj_create_cmd(cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + obj = kzalloc(sizeof(struct devx_obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(dev->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN), + cmd_out, cmd_out_len); + if (err) + goto obj_free; + + uobj->object = obj; + obj->mdev = dev->mdev; + devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj->obj_id); + WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); + if (err) + goto obj_destroy; + + return 0; + +obj_destroy: + mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); +obj_free: + kfree(obj); + return err; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT); + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE); + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct devx_obj *obj = uobj->object; + void *cmd_out; + int err; + + if (!c->devx_uid) + return -EPERM; + + if (!devx_is_obj_modify_cmd(cmd_in)) + return -EINVAL; + + if (!devx_is_valid_obj_id(obj, cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(obj->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN), + cmd_out, cmd_out_len); + if (err) + return err; + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, + cmd_out, cmd_out_len); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT); + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE); + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct devx_obj *obj = uobj->object; + void *cmd_out; + int err; + + if (!c->devx_uid) + return -EPERM; + + if (!devx_is_obj_query_cmd(cmd_in)) + return -EINVAL; + + if (!devx_is_valid_obj_id(obj, cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(obj->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN), + cmd_out, cmd_out_len); + if (err) + return err; + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, + cmd_out, cmd_out_len); +} + +static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, + struct uverbs_attr_bundle *attrs, + struct devx_umem *obj) +{ + u64 addr; + size_t size; + u32 access; + int npages; + int err; + u32 page_mask; + + if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) || + uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN)) + return -EFAULT; + + err = uverbs_get_flags32(&access, attrs, + MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (err) + return err; + + err = ib_check_mr_access(access); + if (err) + return err; + + obj->umem = ib_umem_get(ucontext, addr, size, access, 0); + if (IS_ERR(obj->umem)) + return PTR_ERR(obj->umem); + + mlx5_ib_cont_pages(obj->umem, obj->umem->address, + MLX5_MKEY_PAGE_SHIFT_MASK, &npages, + &obj->page_shift, &obj->ncont, NULL); + + if (!npages) { + ib_umem_release(obj->umem); + return -EINVAL; + } + + page_mask = (1 << obj->page_shift) - 1; + obj->page_offset = obj->umem->address & page_mask; + + return 0; +} + +static int devx_umem_reg_cmd_alloc(struct uverbs_attr_bundle *attrs, + struct devx_umem *obj, + struct devx_umem_reg_cmd *cmd) +{ + cmd->inlen = MLX5_ST_SZ_BYTES(create_umem_in) + + (MLX5_ST_SZ_BYTES(mtt) * obj->ncont); + cmd->in = uverbs_zalloc(attrs, cmd->inlen); + return PTR_ERR_OR_ZERO(cmd->in); +} + +static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev, + struct devx_umem *obj, + struct devx_umem_reg_cmd *cmd) +{ + void *umem; + __be64 *mtt; + + umem = MLX5_ADDR_OF(create_umem_in, cmd->in, umem); + mtt = (__be64 *)MLX5_ADDR_OF(umem, umem, mtt); + + MLX5_SET(general_obj_in_cmd_hdr, cmd->in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd->in, obj_type, MLX5_OBJ_TYPE_UMEM); + MLX5_SET64(umem, umem, num_of_mtt, obj->ncont); + MLX5_SET(umem, umem, log_page_size, obj->page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(umem, umem, page_offset, obj->page_offset); + mlx5_ib_populate_pas(dev, obj->umem, obj->page_shift, mtt, + (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) | + MLX5_IB_MTT_READ); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct devx_umem_reg_cmd cmd; + struct devx_umem *obj; + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE); + u32 obj_id; + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + int err; + + if (!c->devx_uid) + return -EPERM; + + obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + err = devx_umem_get(dev, &c->ibucontext, attrs, obj); + if (err) + goto err_obj_free; + + err = devx_umem_reg_cmd_alloc(attrs, obj, &cmd); + if (err) + goto err_umem_release; + + devx_umem_reg_cmd_build(dev, obj, &cmd); + + MLX5_SET(general_obj_in_cmd_hdr, cmd.in, uid, c->devx_uid); + err = mlx5_cmd_exec(dev->mdev, cmd.in, cmd.inlen, cmd.out, + sizeof(cmd.out)); + if (err) + goto err_umem_release; + + obj->mdev = dev->mdev; + uobj->object = obj; + devx_obj_build_destroy_cmd(cmd.in, cmd.out, obj->dinbox, &obj->dinlen, &obj_id); + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, &obj_id, sizeof(obj_id)); + if (err) + goto err_umem_destroy; + + return 0; + +err_umem_destroy: + mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, cmd.out, sizeof(cmd.out)); +err_umem_release: + ib_umem_release(obj->umem); +err_obj_free: + kfree(obj); + return err; +} + +static int devx_umem_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct devx_umem *obj = uobject->object; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + int err; + + err = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (ib_is_destroy_retryable(err, why, uobject)) + return err; + + ib_umem_release(obj->umem); + kfree(obj); + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_UMEM_REG, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, + MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + enum ib_access_flags), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DEVX_UMEM_DEREG, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_DEREG_HANDLE, + MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_QUERY_EQN, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_QUERY_UAR, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OTHER, + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OTHER_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DEVX_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_WRITE, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG)); + +DECLARE_UVERBS_OBJECT_TREE(devx_objects, + &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX), + &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ), + &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM)); + +const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void) +{ + return &devx_objects; +} diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c new file mode 100644 index 000000000..a0e4e6ddb --- /dev/null +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kref.h> +#include <linux/slab.h> +#include <rdma/ib_umem.h> + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c new file mode 100644 index 000000000..1a29f47f8 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/ib_umem.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/fs.h> +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + +static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { + [MLX5_IB_FLOW_TYPE_NORMAL] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + .u.ptr = { + .len = sizeof(u16), /* data is priority */ + .min_len = sizeof(u16), + } + }, + [MLX5_IB_FLOW_TYPE_SNIFFER] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [MLX5_IB_FLOW_TYPE_ALL_DEFAULT] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [MLX5_IB_FLOW_TYPE_MC_DEFAULT] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, +}; + +static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_flow_handler *flow_handler; + struct mlx5_ib_flow_matcher *fs_matcher; + void *devx_obj; + int dest_id, dest_type; + void *cmd_in; + int inlen; + bool dest_devx, dest_qp; + struct ib_qp *qp = NULL; + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); + struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + dest_devx = + uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); + dest_qp = uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); + + if ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)) + return -EINVAL; + + if (dest_devx) { + devx_obj = uverbs_attr_get_obj( + attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); + if (IS_ERR(devx_obj)) + return PTR_ERR(devx_obj); + + /* Verify that the given DEVX object is a flow + * steering destination. + */ + if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type)) + return -EINVAL; + } else { + struct mlx5_ib_qp *mqp; + + qp = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); + if (IS_ERR(qp)) + return PTR_ERR(qp); + + if (qp->qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + mqp = to_mqp(qp); + if (mqp->flags & MLX5_IB_QP_RSS) + dest_id = mqp->rss_qp.tirn; + else + dest_id = mqp->raw_packet_qp.rq.tirn; + dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + } + + if (dev->rep) + return -ENOTSUPP; + + cmd_in = uverbs_attr_get_alloced_ptr( + attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); + inlen = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); + fs_matcher = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, cmd_in, inlen, + dest_id, dest_type); + if (IS_ERR(flow_handler)) + return PTR_ERR(flow_handler); + + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev); + + return 0; +} + +static int flow_matcher_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct mlx5_ib_flow_matcher *obj = uobject->object; + int ret; + + ret = ib_destroy_usecnt(&obj->usecnt, why, uobject); + if (ret) + return ret; + + kfree(obj); + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE); + struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + struct mlx5_ib_flow_matcher *obj; + int err; + + obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + obj->mask_len = uverbs_attr_get_len( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); + err = uverbs_copy_from(&obj->matcher_mask, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); + if (err) + goto end; + + obj->flow_type = uverbs_attr_get_enum_id( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE); + + if (obj->flow_type == MLX5_IB_FLOW_TYPE_NORMAL) { + err = uverbs_copy_from(&obj->priority, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE); + if (err) + goto end; + } + + err = uverbs_copy_from(&obj->match_criteria_enable, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA); + if (err) + goto end; + + uobj->object = obj; + obj->mdev = dev->mdev; + atomic_set(&obj->usecnt, 0); + return 0; + +end: + kfree(obj); + return err; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_CREATE_FLOW, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, + UVERBS_OBJECT_FLOW, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, + UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_READ), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DESTROY_FLOW, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, + UVERBS_OBJECT_FLOW, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +ADD_UVERBS_METHODS(mlx5_ib_fs, + UVERBS_OBJECT_FLOW, + &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW), + &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_MATCHER_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, + UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)), + UA_MANDATORY), + UVERBS_ATTR_ENUM_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, + mlx5_ib_flow_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + UVERBS_ATTR_TYPE(u8), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_TYPE_ALLOC_IDR(flow_matcher_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY)); + +DECLARE_UVERBS_OBJECT_TREE(flow_objects, + &UVERBS_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER)); + +int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) +{ + int i = 0; + + root[i++] = &flow_objects; + root[i++] = &mlx5_ib_fs; + + return i; +} diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c new file mode 100644 index 000000000..5c73c0a79 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +struct mlx5_ib_gsi_wr { + struct ib_cqe cqe; + struct ib_wc wc; + int send_flags; + bool completed:1; +}; + +struct mlx5_ib_gsi_qp { + struct ib_qp ibqp; + struct ib_qp *rx_qp; + u8 port_num; + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + /* Serialize qp state modifications */ + struct mutex mutex; + struct ib_cq *cq; + struct mlx5_ib_gsi_wr *outstanding_wrs; + u32 outstanding_pi, outstanding_ci; + int num_qps; + /* Protects access to the tx_qps. Post send operations synchronize + * with tx_qp creation in setup_qp(). Also protects the + * outstanding_wrs array and indices. + */ + spinlock_t lock; + struct ib_qp **tx_qps; +}; + +static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) +{ + return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); +} + +static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); +} + +/* Call with gsi->lock locked */ +static void generate_completions(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_cq *gsi_cq = gsi->ibqp.send_cq; + struct mlx5_ib_gsi_wr *wr; + u32 index; + + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; + index++) { + wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr]; + + if (!wr->completed) + break; + + if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR || + wr->send_flags & IB_SEND_SIGNALED) + WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc)); + + wr->completed = false; + } + + gsi->outstanding_ci = index; +} + +static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = cq->cq_context; + struct mlx5_ib_gsi_wr *wr = + container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); + u64 wr_id; + unsigned long flags; + + spin_lock_irqsave(&gsi->lock, flags); + wr->completed = true; + wr_id = wr->wc.wr_id; + wr->wc = *wc; + wr->wc.wr_id = wr_id; + wr->wc.qp = &gsi->ibqp; + + generate_completions(gsi); + spin_unlock_irqrestore(&gsi->lock, flags); +} + +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_gsi_qp *gsi; + struct ib_qp_init_attr hw_init_attr = *init_attr; + const u8 port_num = init_attr->port_num; + const int num_pkeys = pd->device->attrs.max_pkeys; + const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0; + int ret; + + mlx5_ib_dbg(dev, "creating GSI QP\n"); + + if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { + mlx5_ib_warn(dev, + "invalid port number %d during GSI QP creation\n", + port_num); + return ERR_PTR(-EINVAL); + } + + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); + if (!gsi) + return ERR_PTR(-ENOMEM); + + gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); + if (!gsi->tx_qps) { + ret = -ENOMEM; + goto err_free; + } + + gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr, + sizeof(*gsi->outstanding_wrs), + GFP_KERNEL); + if (!gsi->outstanding_wrs) { + ret = -ENOMEM; + goto err_free_tx; + } + + mutex_init(&gsi->mutex); + + mutex_lock(&dev->devr.mutex); + + if (dev->devr.ports[port_num - 1].gsi) { + mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", + port_num); + ret = -EBUSY; + goto err_free_wrs; + } + gsi->num_qps = num_qps; + spin_lock_init(&gsi->lock); + + gsi->cap = init_attr->cap; + gsi->sq_sig_type = init_attr->sq_sig_type; + gsi->ibqp.qp_num = 1; + gsi->port_num = port_num; + + gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0, + IB_POLL_SOFTIRQ); + if (IS_ERR(gsi->cq)) { + mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", + PTR_ERR(gsi->cq)); + ret = PTR_ERR(gsi->cq); + goto err_free_wrs; + } + + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + hw_init_attr.send_cq = gsi->cq; + if (num_qps) { + hw_init_attr.cap.max_send_wr = 0; + hw_init_attr.cap.max_send_sge = 0; + hw_init_attr.cap.max_inline_data = 0; + } + gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); + if (IS_ERR(gsi->rx_qp)) { + mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", + PTR_ERR(gsi->rx_qp)); + ret = PTR_ERR(gsi->rx_qp); + goto err_destroy_cq; + } + + dev->devr.ports[init_attr->port_num - 1].gsi = gsi; + + mutex_unlock(&dev->devr.mutex); + + return &gsi->ibqp; + +err_destroy_cq: + ib_free_cq(gsi->cq); +err_free_wrs: + mutex_unlock(&dev->devr.mutex); + kfree(gsi->outstanding_wrs); +err_free_tx: + kfree(gsi->tx_qps); +err_free: + kfree(gsi); + return ERR_PTR(ret); +} + +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + const int port_num = gsi->port_num; + int qp_index; + int ret; + + mlx5_ib_dbg(dev, "destroying GSI QP\n"); + + mutex_lock(&dev->devr.mutex); + ret = ib_destroy_qp(gsi->rx_qp); + if (ret) { + mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n", + ret); + mutex_unlock(&dev->devr.mutex); + return ret; + } + dev->devr.ports[port_num - 1].gsi = NULL; + mutex_unlock(&dev->devr.mutex); + gsi->rx_qp = NULL; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { + if (!gsi->tx_qps[qp_index]) + continue; + WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); + gsi->tx_qps[qp_index] = NULL; + } + + ib_free_cq(gsi->cq); + + kfree(gsi->outstanding_wrs); + kfree(gsi->tx_qps); + kfree(gsi); + + return 0; +} + +static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_pd *pd = gsi->rx_qp->pd; + struct ib_qp_init_attr init_attr = { + .event_handler = gsi->rx_qp->event_handler, + .qp_context = gsi->rx_qp->qp_context, + .send_cq = gsi->cq, + .recv_cq = gsi->rx_qp->recv_cq, + .cap = { + .max_send_wr = gsi->cap.max_send_wr, + .max_send_sge = gsi->cap.max_send_sge, + .max_inline_data = gsi->cap.max_inline_data, + }, + .sq_sig_type = gsi->sq_sig_type, + .qp_type = IB_QPT_UD, + .create_flags = mlx5_ib_create_qp_sqpn_qp1(), + }; + + return ib_create_qp(pd, &init_attr); +} + +static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, + u16 qp_index) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct ib_qp_attr attr; + int mask; + int ret; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = qp_index; + attr.qkey = IB_QP1_QKEY; + attr.port_num = gsi->port_num; + ret = ib_modify_qp(qp, &attr, mask); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", + qp->qp_num, ret); + return ret; + } + + return 0; +} + +static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) +{ + struct ib_device *device = gsi->rx_qp->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_qp *qp; + unsigned long flags; + u16 pkey; + int ret; + + ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (ret) { + mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", + gsi->port_num, qp_index); + return; + } + + if (!pkey) { + mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. Skipping.\n", + gsi->port_num, qp_index); + return; + } + + spin_lock_irqsave(&gsi->lock, flags); + qp = gsi->tx_qps[qp_index]; + spin_unlock_irqrestore(&gsi->lock, flags); + if (qp) { + mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. Skipping\n", + gsi->port_num, qp_index); + return; + } + + qp = create_gsi_ud_qp(gsi); + if (IS_ERR(qp)) { + mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", + PTR_ERR(qp)); + return; + } + + ret = modify_to_rts(gsi, qp, qp_index); + if (ret) + goto err_destroy_qp; + + spin_lock_irqsave(&gsi->lock, flags); + WARN_ON_ONCE(gsi->tx_qps[qp_index]); + gsi->tx_qps[qp_index] = qp; + spin_unlock_irqrestore(&gsi->lock, flags); + + return; + +err_destroy_qp: + WARN_ON_ONCE(qp); +} + +static void setup_qps(struct mlx5_ib_gsi_qp *gsi) +{ + u16 qp_index; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); +} + +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); + + mutex_lock(&gsi->mutex); + ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + if (ret) { + mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); + goto unlock; + } + + if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS) + setup_qps(gsi); + +unlock: + mutex_unlock(&gsi->mutex); + + return ret; +} + +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mutex_lock(&gsi->mutex); + ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); + qp_init_attr->cap = gsi->cap; + mutex_unlock(&gsi->mutex); + + return ret; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr, struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_gsi_wr *gsi_wr; + + if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { + mlx5_ib_warn(dev, "no available GSI work request.\n"); + return -ENOMEM; + } + + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi % + gsi->cap.max_send_wr]; + gsi->outstanding_pi++; + + if (!wc) { + memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); + gsi_wr->wc.pkey_index = wr->pkey_index; + gsi_wr->wc.wr_id = wr->wr.wr_id; + } else { + gsi_wr->wc = *wc; + gsi_wr->completed = true; + } + + gsi_wr->cqe.done = &handle_single_completion; + wr->wr.wr_cqe = &gsi_wr->cqe; + + return 0; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr) +{ + struct ib_wc wc = { + { .wr_id = wr->wr.wr_id }, + .status = IB_WC_SUCCESS, + .opcode = IB_WC_SEND, + .qp = &gsi->ibqp, + }; + int ret; + + ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc); + if (ret) + return ret; + + generate_completions(gsi); + + return 0; +} + +/* Call with gsi->lock locked */ +static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + int qp_index = wr->pkey_index; + + if (!mlx5_ib_deth_sqpn_cap(dev)) + return gsi->rx_qp; + + if (qp_index >= gsi->num_qps) + return NULL; + + return gsi->tx_qps[qp_index]; +} + +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + struct ib_qp *tx_qp; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ib_ud_wr cur_wr = *ud_wr(wr); + + cur_wr.wr.next = NULL; + + spin_lock_irqsave(&gsi->lock, flags); + tx_qp = get_tx_qp(gsi, &cur_wr); + if (!tx_qp) { + ret = mlx5_ib_gsi_silent_drop(gsi, &cur_wr); + if (ret) + goto err; + spin_unlock_irqrestore(&gsi->lock, flags); + continue; + } + + ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); + if (ret) + goto err; + + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); + if (ret) { + /* Undo the effect of adding the outstanding wr */ + gsi->outstanding_pi--; + goto err; + } + spin_unlock_irqrestore(&gsi->lock, flags); + } + + return 0; + +err: + spin_unlock_irqrestore(&gsi->lock, flags); + *bad_wr = wr; + return ret; +} + +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + + return ib_post_recv(gsi->rx_qp, wr, bad_wr); +} + +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) +{ + if (!gsi) + return; + + mutex_lock(&gsi->mutex); + setup_qps(gsi); + mutex_unlock(&gsi->mutex); +} diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c new file mode 100644 index 000000000..b841589c2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + */ + +#include "ib_rep.h" + +static const struct mlx5_ib_profile rep_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, + mlx5_ib_stage_rep_flow_db_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_rep_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_rep_roce_init, + mlx5_ib_stage_rep_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), +}; + +static int +mlx5_ib_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + return 0; +} + +static void +mlx5_ib_nic_rep_unload(struct mlx5_eswitch_rep *rep) +{ + rep->rep_if[REP_IB].priv = NULL; +} + +static int +mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5_ib_dev *ibdev; + + ibdev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*ibdev)); + if (!ibdev) + return -ENOMEM; + + ibdev->rep = rep; + ibdev->mdev = dev; + ibdev->num_ports = max(MLX5_CAP_GEN(dev, num_ports), + MLX5_CAP_GEN(dev, num_vhca_ports)); + if (!__mlx5_ib_add(ibdev, &rep_profile)) { + ib_dealloc_device(&ibdev->ib_dev); + return -EINVAL; + } + + rep->rep_if[REP_IB].priv = ibdev; + + return 0; +} + +static void +mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) +{ + struct mlx5_ib_dev *dev; + + if (!rep->rep_if[REP_IB].priv) + return; + + dev = mlx5_ib_rep_to_dev(rep); + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); + rep->rep_if[REP_IB].priv = NULL; +} + +static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep) +{ + return mlx5_ib_rep_to_dev(rep); +} + +static void mlx5_ib_rep_register_vf_vports(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev); + int vport; + + for (vport = 1; vport < total_vfs; vport++) { + struct mlx5_eswitch_rep_if rep_if = {}; + + rep_if.load = mlx5_ib_vport_rep_load; + rep_if.unload = mlx5_ib_vport_rep_unload; + rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev; + mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_IB); + } +} + +static void mlx5_ib_rep_unregister_vf_vports(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev); + int vport; + + for (vport = 1; vport < total_vfs; vport++) + mlx5_eswitch_unregister_vport_rep(esw, vport, REP_IB); +} + +void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + struct mlx5_eswitch_rep_if rep_if = {}; + + rep_if.load = mlx5_ib_nic_rep_load; + rep_if.unload = mlx5_ib_nic_rep_unload; + rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev; + rep_if.priv = dev; + + mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_IB); + + mlx5_ib_rep_register_vf_vports(dev); +} + +void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + + mlx5_ib_rep_unregister_vf_vports(dev); /* VFs vports */ + mlx5_eswitch_unregister_vport_rep(esw, 0, REP_IB); /* UPLINK PF*/ +} + +u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw) +{ + return mlx5_eswitch_mode(esw); +} + +struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return mlx5_eswitch_get_proto_dev(esw, vport_index, REP_IB); +} + +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return mlx5_eswitch_get_proto_dev(esw, vport_index, REP_ETH); +} + +struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw) +{ + return mlx5_eswitch_uplink_get_proto_dev(esw, REP_IB); +} + +struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, int vport) +{ + return mlx5_eswitch_vport_rep(esw, vport); +} + +int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + struct mlx5_flow_handle *flow_rule; + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + + if (!dev->rep) + return 0; + + flow_rule = + mlx5_eswitch_add_send_to_vport_rule(esw, + dev->rep->vport, + sq->base.mqp.qpn); + if (IS_ERR(flow_rule)) + return PTR_ERR(flow_rule); + sq->flow_rule = flow_rule; + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h new file mode 100644 index 000000000..2ba73636a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ib_rep.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + */ + +#ifndef __MLX5_IB_REP_H__ +#define __MLX5_IB_REP_H__ + +#include <linux/mlx5/eswitch.h> +#include "mlx5_ib.h" + +#ifdef CONFIG_MLX5_ESWITCH +u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw); +struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, + int vport_index); +struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw); +struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, + int vport_index); +void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev); +void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev); +int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq); +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + int vport_index); +#else /* CONFIG_MLX5_ESWITCH */ +static inline u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw) +{ + return SRIOV_NONE; +} + +static inline +struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return NULL; +} + +static inline +struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw) +{ + return NULL; +} + +static inline +struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, + int vport_index) +{ + return NULL; +} + +static inline void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev) {} +static inline void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev) {} +static inline int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + return 0; +} + +static inline +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return NULL; +} +#endif + +static inline +struct mlx5_ib_dev *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep) +{ + return (struct mlx5_ib_dev *)rep->rep_if[REP_IB].priv; +} +#endif /* __MLX5_IB_REP_H__ */ diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c new file mode 100644 index 000000000..649a3364f --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/mlx5/vport.h> +#include "mlx5_ib.h" + +static inline u32 mlx_to_net_policy(enum port_state_policy mlx_policy) +{ + switch (mlx_policy) { + case MLX5_POLICY_DOWN: + return IFLA_VF_LINK_STATE_DISABLE; + case MLX5_POLICY_UP: + return IFLA_VF_LINK_STATE_ENABLE; + case MLX5_POLICY_FOLLOW: + return IFLA_VF_LINK_STATE_AUTO; + default: + return __IFLA_VF_LINK_STATE_MAX; + } +} + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(mdev, 1, 1, vf + 1, rep); + if (err) { + mlx5_ib_warn(dev, "failed to query port policy for vf %d (%d)\n", + vf, err); + goto free; + } + memset(info, 0, sizeof(*info)); + info->linkstate = mlx_to_net_policy(rep->policy); + if (info->linkstate == __IFLA_VF_LINK_STATE_MAX) + err = -EINVAL; + +free: + kfree(rep); + return err; +} + +static inline enum port_state_policy net_to_mlx_policy(int policy) +{ + switch (policy) { + case IFLA_VF_LINK_STATE_DISABLE: + return MLX5_POLICY_DOWN; + case IFLA_VF_LINK_STATE_ENABLE: + return MLX5_POLICY_UP; + case IFLA_VF_LINK_STATE_AUTO: + return MLX5_POLICY_FOLLOW; + default: + return MLX5_POLICY_INVALID; + } +} + +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u8 port, int state) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->policy = net_to_mlx_policy(state); + if (in->policy == MLX5_POLICY_INVALID) { + err = -EINVAL; + goto out; + } + in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].policy = in->policy; + +out: + kfree(in); + return err; +} + +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u8 port, struct ifla_vf_stats *stats) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *dev; + void *out; + int err; + + dev = to_mdev(device); + mdev = dev->mdev; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_vport_counter(mdev, true, vf, port, out, out_sz); + if (err) + goto ex; + + stats->rx_packets = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.packets); + stats->tx_packets = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.packets); + stats->rx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.octets); + stats->tx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.octets); + stats->multicast = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_multicast.packets); + +ex: + kfree(out); + return err; +} + +static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; + in->node_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].node_guid = guid; + kfree(in); + return err; +} + +static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; + in->port_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].port_guid = guid; + kfree(in); + return err; +} + +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, + u64 guid, int type) +{ + if (type == IFLA_VF_IB_NODE_GUID) + return set_vf_node_guid(device, vf, port, guid); + else if (type == IFLA_VF_IB_PORT_GUID) + return set_vf_port_guid(device, vf, port, guid); + + return -EINVAL; +} diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c new file mode 100644 index 000000000..cdf6e26eb --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mlx5/cmd.h> +#include <linux/mlx5/vport.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_pma.h> +#include "mlx5_ib.h" + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num, + struct ib_mad *in_mad) +{ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED && + in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return true; + return dev->mdev->port_caps[port_num - 1].has_smi; +} + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + if (!can_do_mad_ifc(dev, port, (struct ib_mad *)in_mad)) + return -EPERM; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); +} + +static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + void *out) +{ +#define MLX5_SUM_CNT(p, cntr1, cntr2) \ + (MLX5_GET64(query_vport_counter_out, p, cntr1) + \ + MLX5_GET64(query_vport_counter_out, p, cntr2)) + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, + transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, + received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.packets, + transmitted_ib_multicast.packets)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.packets, + received_ib_multicast.packets)); + pma_cnt_ext->port_unicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + void *out) +{ + /* Traffic counters will be reported in + * their 64bit form via ib_pma_portcounters_ext by default. + */ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_ASSIGN_PMA_CNTR(counter_var, counter_name) { \ + counter_var = MLX5_GET_BE(typeof(counter_var), \ + ib_port_cntrs_grp_data_layout, \ + out_pma, counter_name); \ + } + + MLX5_ASSIGN_PMA_CNTR(pma_cnt->symbol_error_counter, + symbol_error_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_error_recovery_counter, + link_error_recovery_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_downed_counter, + link_downed_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_errors, + port_rcv_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_remphys_errors, + port_rcv_remote_physical_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_switch_relay_errors, + port_rcv_switch_relay_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_discards, + port_xmit_discards); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, + port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_wait, + port_xmit_wait); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, + port_rcv_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, + link_overrun_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->vl15_dropped, + vl_15_dropped); +} + +static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_core_dev *mdev; + bool native_port = true; + u8 mdev_port_num; + void *out_cnt; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* Fail to get the native port, likely due to 2nd port is still + * unaffiliated. In such case default to 1st port and attached + * PF device. + */ + native_port = false; + mdev = dev->mdev; + mdev_port_num = 1; + } + /* Declaring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + goto done; + } + + if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + + out_cnt = kvzalloc(sz, GFP_KERNEL); + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } + + err = mlx5_core_query_vport_counter(mdev, 0, 0, + mdev_port_num, out_cnt, sz); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + out_cnt = kvzalloc(sz, GFP_KERNEL); + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } + + err = mlx5_core_query_ib_ppcnt(mdev, mdev_port_num, + out_cnt, sz); + if (!err) + pma_cnt_assign(pma_cnt, out_cnt); + } + kvfree(out_cnt); + err = err ? IB_MAD_RESULT_FAILURE : + IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +done: + if (native_port) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + int ret; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + memset(out_mad->data, 0, sizeof(out_mad->data)); + + if (MLX5_CAP_GEN(dev->mdev, vport_counters) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { + ret = process_pma_cmd(dev, port_num, in_mad, out_mad); + } else { + ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + in_mad, out_mad); + } + return ret; +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, + struct ib_smp *out_mad) +{ + struct ib_smp *in_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + if (!in_mad) + return -ENOMEM; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, + out_mad); + + kfree(in_mad); + return err; +} + +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + memcpy(sys_image_guid, out_mad->data + 4, 8); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + *max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + *vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & 0xffff; + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_guid, out_mad->data + 12, 8); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int err = -ENOMEM; + + if (port < 1 || port > dev->num_ports) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + /* props being zeroed by the caller, avoid zeroing it here */ + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mdev->port_caps[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (mdev->port_caps[port - 1].ext_port_cap & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c new file mode 100644 index 000000000..1688c06d5 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/main.c @@ -0,0 +1,6461 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/debugfs.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/slab.h> +#include <linux/bitmap.h> +#if defined(CONFIG_X86) +#include <asm/pat.h> +#endif +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> +#include <linux/delay.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> +#include <linux/mlx5/port.h> +#include <linux/mlx5/vport.h> +#include <linux/mlx5/fs.h> +#include <linux/list.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> +#include <linux/in.h> +#include <linux/etherdevice.h> +#include "mlx5_ib.h" +#include "ib_rep.h" +#include "cmd.h" +#include <linux/mlx5/fs_helpers.h> +#include <linux/mlx5/accel.h> +#include <rdma/uverbs_std_types.h> +#include <rdma/mlx5_user_ioctl_verbs.h> +#include <rdma/mlx5_user_ioctl_cmds.h> + +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "5.0-0" + +MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION "\n"; + +struct mlx5_ib_event_work { + struct work_struct work; + struct mlx5_core_dev *dev; + void *context; + enum mlx5_dev_event event; + unsigned long param; +}; + +enum { + MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, +}; + +static struct workqueue_struct *mlx5_ib_event_wq; +static LIST_HEAD(mlx5_ib_unaffiliated_port_list); +static LIST_HEAD(mlx5_ib_dev_list); +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_ib_multiport_mutex); + +/* We can't use an array for xlt_emergency_page because dma_map_single + * doesn't work on kernel modules memory + */ +static unsigned long xlt_emergency_page; +static struct mutex xlt_emergency_page_mutex; + +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi) +{ + struct mlx5_ib_dev *dev; + + mutex_lock(&mlx5_ib_multiport_mutex); + dev = mpi->ibdev; + mutex_unlock(&mlx5_ib_multiport_mutex); + return dev; +} + +static enum rdma_link_layer +mlx5_port_type_cap_to_rdma_ll(int port_type_cap) +{ + switch (port_type_cap) { + case MLX5_CAP_PORT_TYPE_IB: + return IB_LINK_LAYER_INFINIBAND; + case MLX5_CAP_PORT_TYPE_ETH: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} + +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + + return mlx5_port_type_cap_to_rdma_ll(port_type_cap); +} + +static int get_port_state(struct ib_device *ibdev, + u8 port_num, + enum ib_port_state *state) +{ + struct ib_port_attr attr; + int ret; + + memset(&attr, 0, sizeof(attr)); + ret = ibdev->query_port(ibdev, port_num, &attr); + if (!ret) + *state = attr.state; + return ret; +} + +static int mlx5_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + u8 port_num = roce->native_port_num; + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *ibdev; + + ibdev = roce->dev; + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + write_lock(&roce->netdev_lock); + if (ibdev->rep) { + struct mlx5_eswitch *esw = ibdev->mdev->priv.eswitch; + struct net_device *rep_ndev; + + rep_ndev = mlx5_ib_get_rep_netdev(esw, + ibdev->rep->vport); + if (rep_ndev == ndev) + roce->netdev = (event == NETDEV_UNREGISTER) ? + NULL : ndev; + } else if (ndev->dev.parent == &mdev->pdev->dev) { + roce->netdev = (event == NETDEV_UNREGISTER) ? + NULL : ndev; + } + write_unlock(&roce->netdev_lock); + break; + + case NETDEV_CHANGE: + case NETDEV_UP: + case NETDEV_DOWN: { + struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); + struct net_device *upper = NULL; + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } + + if ((upper == ndev || (!upper && ndev == roce->netdev)) + && ibdev->ib_active) { + struct ib_event ibev = { }; + enum ib_port_state port_state; + + if (get_port_state(&ibdev->ib_dev, port_num, + &port_state)) + goto done; + + if (roce->last_port_state == port_state) + goto done; + + roce->last_port_state = port_state; + ibev.device = &ibdev->ib_dev; + if (port_state == IB_PORT_DOWN) + ibev.event = IB_EVENT_PORT_ERR; + else if (port_state == IB_PORT_ACTIVE) + ibev.event = IB_EVENT_PORT_ACTIVE; + else + goto done; + + ibev.element.port_num = port_num; + ib_dispatch_event(&ibev); + } + break; + } + + default: + break; + } +done: + mlx5_ib_put_native_port_mdev(ibdev, port_num); + return NOTIFY_DONE; +} + +static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct mlx5_ib_dev *ibdev = to_mdev(device); + struct net_device *ndev; + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NULL; + + ndev = mlx5_lag_get_roce_netdev(mdev); + if (ndev) + goto out; + + /* Ensure ndev does not disappear before we invoke dev_hold() + */ + read_lock(&ibdev->roce[port_num - 1].netdev_lock); + ndev = ibdev->roce[port_num - 1].netdev; + if (ndev) + dev_hold(ndev); + read_unlock(&ibdev->roce[port_num - 1].netdev_lock); + +out: + mlx5_ib_put_native_port_mdev(ibdev, port_num); + return ndev; +} + +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, + u8 ib_port_num, + u8 *native_port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + ib_port_num); + struct mlx5_core_dev *mdev = NULL; + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || + ll != IB_LINK_LAYER_ETHERNET) { + if (native_port_num) + *native_port_num = ib_port_num; + return ibdev->mdev; + } + + if (native_port_num) + *native_port_num = 1; + + port = &ibdev->port[ib_port_num - 1]; + if (!port) + return NULL; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[ib_port_num - 1].mp.mpi; + if (mpi && !mpi->unaffiliate) { + mdev = mpi->mdev; + /* If it's the master no need to refcount, it'll exist + * as long as the ib_dev exists. + */ + if (!mpi->is_master) + mpi->mdev_refcnt++; + } + spin_unlock(&port->mp.mpi_lock); + + return mdev; +} + +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + port_num); + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + port = &ibdev->port[port_num - 1]; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[port_num - 1].mp.mpi; + if (mpi->is_master) + goto out; + + mpi->mdev_refcnt--; + if (mpi->unaffiliate) + complete(&mpi->unref_comp); +out: + spin_unlock(&port->mp.mpi_lock); +} + +static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, + u8 *active_width) +{ + switch (eth_proto_oper) { + case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII): + case MLX5E_PROT_MASK(MLX5E_1000BASE_KX): + case MLX5E_PROT_MASK(MLX5E_100BASE_TX): + case MLX5E_PROT_MASK(MLX5E_1000BASE_T): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + break; + case MLX5E_PROT_MASK(MLX5E_10GBASE_T): + case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4): + case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4): + case MLX5E_PROT_MASK(MLX5E_10GBASE_KR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_CR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_SR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_ER): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_25GBASE_CR): + case MLX5E_PROT_MASK(MLX5E_25GBASE_KR): + case MLX5E_PROT_MASK(MLX5E_25GBASE_SR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2): + case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): + case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_56GBASE_R4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR; + break; + case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev; + struct net_device *ndev, *upper; + enum ib_mtu ndev_ib_mtu; + bool put_mdev = true; + u16 qkey_viol_cntr; + u32 eth_prot_oper; + u8 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* This means the port isn't affiliated yet. Get the + * info for the master port instead. + */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + port_num = 1; + } + + /* Possible bad flows are checked before filling out props so in case + * of an error it will still be zeroed out. + */ + err = mlx5_query_port_eth_proto_oper(mdev, ð_prot_oper, + mdev_port_num); + if (err) + goto out; + + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_QDR; + + translate_eth_proto_oper(eth_prot_oper, &props->active_speed, + &props->active_width); + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->ip_gids = true; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + + /* If this is a stub query for an unaffiliated port stop here */ + if (!put_mdev) + goto out; + + ndev = mlx5_ib_get_netdev(device, port_num); + if (!ndev) + goto out; + + if (mlx5_lag_is_active(dev->mdev)) { + rcu_read_lock(); + upper = netdev_master_upper_dev_get_rcu(ndev); + if (upper) { + dev_put(ndev); + ndev = upper; + dev_hold(ndev); + } + rcu_read_unlock(); + } + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + ndev_ib_mtu = iboe_get_mtu(ndev->mtu); + + dev_put(ndev); + + props->active_mtu = min(props->max_mtu, ndev_ib_mtu); +out: + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; +} + +static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + u8 roce_version = 0; + u8 roce_l3_type = 0; + bool vlan = false; + u8 mac[ETH_ALEN]; + u16 vlan_id = 0; + + if (gid) { + gid_type = attr->gid_type; + ether_addr_copy(mac, attr->ndev->dev_addr); + + if (is_vlan_dev(attr->ndev)) { + vlan = true; + vlan_id = vlan_dev_vlan_id(attr->ndev); + } + } + + switch (gid_type) { + case IB_GID_TYPE_IB: + roce_version = MLX5_ROCE_VERSION_1; + break; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + roce_version = MLX5_ROCE_VERSION_2; + if (ipv6_addr_v4mapped((void *)gid)) + roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4; + else + roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6; + break; + + default: + mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type); + } + + return mlx5_core_roce_gid_set(dev->mdev, index, roce_version, + roce_l3_type, gid->raw, mac, vlan, + vlan_id, port_num); +} + +static int mlx5_ib_add_gid(const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, &attr->gid, attr); +} + +static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, NULL, NULL); +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr) +{ + if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return 0; + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + +static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) + return !MLX5_CAP_GEN(dev->mdev, ib_virt); + return 0; +} + +enum { + MLX5_VPORT_ACCESS_METHOD_MAD, + MLX5_VPORT_ACCESS_METHOD_HCA, + MLX5_VPORT_ACCESS_METHOD_NIC, +}; + +static int mlx5_get_vport_access_method(struct ib_device *ibdev) +{ + if (mlx5_use_mad_ifc(to_mdev(ibdev))) + return MLX5_VPORT_ACCESS_METHOD_MAD; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) + return MLX5_VPORT_ACCESS_METHOD_NIC; + + return MLX5_VPORT_ACCESS_METHOD_HCA; +} + +static void get_atomic_caps(struct mlx5_ib_dev *dev, + u8 atomic_size_qp, + struct ib_device_attr *props) +{ + u8 tmp; + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic_req_8B_endianness_mode = + MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode); + + /* Check if HW supports 8 bytes standard atomic operations and capable + * of host endianness respond + */ + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) && + (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && + (atomic_req_8B_endianness_mode)) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } +} + +static void get_atomic_caps_qp(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +static void get_atomic_caps_dc(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr props = {}; + + get_atomic_caps_dc(dev, &props); + return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false; +} +static int mlx5_query_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_system_image_guid(ibdev, + sys_image_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + break; + + default: + return -EINVAL; + } + + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + + return err; + +} + +static int mlx5_query_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, + pkey_table_size)); + return 0; + + default: + return -EINVAL; + } +} + +static int mlx5_query_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_core_query_vendor_id(dev->mdev, vendor_id); + + default: + return -EINVAL; + } +} + +static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, + __be64 *node_guid) +{ + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(&dev->ib_dev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_node_guid(dev, node_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + break; + + default: + return -EINVAL; + } + + if (!err) + *node_guid = cpu_to_be64(tmp); + + return err; +} + +struct mlx5_reg_node_desc { + u8 desc[IB_DEVICE_NODE_DESC_MAX]; +}; + +static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct mlx5_reg_node_desc in; + + if (mlx5_use_mad_ifc(dev)) + return mlx5_query_mad_ifc_node_desc(dev, node_desc); + + memset(&in, 0, sizeof(in)); + + return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, + sizeof(struct mlx5_reg_node_desc), + MLX5_REG_NODE_DESC, 0, 0); +} + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + int err = -ENOMEM; + int max_sq_desc; + int max_rq_sg; + int max_sq_sg; + u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); + bool raw_support = !mlx5_core_mp_enabled(mdev); + struct mlx5_ib_query_device_resp resp = {}; + size_t resp_len; + u64 max_tso; + + resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); + if (uhw->outlen && uhw->outlen < resp_len) + return -EINVAL; + else + resp.response_length = resp_len; + + if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + err = mlx5_query_system_image_guid(ibdev, + &props->sys_image_guid); + if (err) + return err; + + err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); + if (err) + return err; + + err = mlx5_query_vendor_id(ibdev, &props->vendor_id); + if (err) + return err; + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + (fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (MLX5_CAP_GEN(mdev, pkv)) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (MLX5_CAP_GEN(mdev, qkv)) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (MLX5_CAP_GEN(mdev, apm)) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (MLX5_CAP_GEN(mdev, xrc)) + props->device_cap_flags |= IB_DEVICE_XRC; + if (MLX5_CAP_GEN(mdev, imaicl)) { + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_WINDOW_TYPE_2B; + props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + /* We support 'Gappy' memory registration too */ + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; + } + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (MLX5_CAP_GEN(mdev, sho)) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (MLX5_CAP_GEN(mdev, block_lb_mc)) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) { + if (MLX5_CAP_ETH(mdev, csum_cap)) { + /* Legacy bit to support old userspace libraries */ + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM; + } + + if (MLX5_CAP_ETH(dev->mdev, vlan_cap)) + props->raw_packet_caps |= + IB_RAW_PACKET_CAP_CVLAN_STRIPPING; + + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); + if (max_tso) { + resp.tso_caps.max_tso = 1 << max_tso; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + resp.response_length += sizeof(resp.tso_caps); + } + } + + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + resp.rss_caps.rx_hash_function = + MLX5_RX_HASH_FUNC_TOEPLITZ; + resp.rss_caps.rx_hash_fields_mask = + MLX5_RX_HASH_SRC_IPV4 | + MLX5_RX_HASH_DST_IPV4 | + MLX5_RX_HASH_SRC_IPV6 | + MLX5_RX_HASH_DST_IPV6 | + MLX5_RX_HASH_SRC_PORT_TCP | + MLX5_RX_HASH_DST_PORT_TCP | + MLX5_RX_HASH_SRC_PORT_UDP | + MLX5_RX_HASH_DST_PORT_UDP | + MLX5_RX_HASH_INNER; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) + resp.rss_caps.rx_hash_fields_mask |= + MLX5_RX_HASH_IPSEC_SPI; + resp.response_length += sizeof(resp.rss_caps); + } + } else { + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) + resp.response_length += sizeof(resp.tso_caps); + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) + resp.response_length += sizeof(resp.rss_caps); + } + + if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + props->device_cap_flags |= IB_DEVICE_UD_TSO; + } + + if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && + MLX5_CAP_GEN(dev->mdev, general_notification_event) && + raw_support) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; + + if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && + MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap)) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, scatter_fcs) && + raw_support) { + /* Legacy bit to support old userspace libraries */ + props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; + } + + if (MLX5_CAP_DEV_MEM(mdev, memic)) { + props->max_dm_size = + MLX5_CAP_DEV_MEM(mdev, max_memic_size); + } + + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + + if (MLX5_CAP_GEN(mdev, end_pad)) + props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING; + + props->vendor_part_id = mdev->pdev->device; + props->hw_ver = mdev->pdev->revision; + + props->max_mr_size = ~0ull; + props->page_size_cap = ~(min_page_size - 1); + props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); + props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); + max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); + max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + props->max_send_sge = max_sq_sg; + props->max_recv_sge = max_rq_sg; + props->max_sge_rd = MLX5_MAX_SGE_RD; + props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); + props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; + props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); + props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); + props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); + props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); + props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; + props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); + get_atomic_caps_qp(dev, props); + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); + props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + props->max_ah = INT_MAX; + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (MLX5_CAP_GEN(mdev, pg)) + props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; +#endif + + if (MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + + if (!mlx5_core_is_pf(mdev)) + props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET && raw_support) { + props->rss_caps.max_rwq_indirection_tables = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); + props->rss_caps.max_rwq_indirection_table_size = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); + } + + if (MLX5_CAP_GEN(mdev, tag_matching)) { + props->tm_caps.max_num_tags = + (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; + props->tm_caps.max_ops = + 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->tm_caps.max_sge = MLX5_TM_MAX_SGE; + } + + if (MLX5_CAP_GEN(mdev, tag_matching) && + MLX5_CAP_GEN(mdev, rndv_offload_rc)) { + props->tm_caps.flags = IB_TM_CAP_RNDV_RC; + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + } + + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { + props->cq_caps.max_cq_moderation_count = + MLX5_MAX_CQ_COUNT; + props->cq_caps.max_cq_moderation_period = + MLX5_MAX_CQ_PERIOD; + } + + if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { + resp.response_length += sizeof(resp.cqe_comp_caps); + + if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { + resp.cqe_comp_caps.max_num = + MLX5_CAP_GEN(dev->mdev, + cqe_compression_max_num); + + resp.cqe_comp_caps.supported_format = + MLX5_IB_CQE_RES_FORMAT_HASH | + MLX5_IB_CQE_RES_FORMAT_CSUM; + + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + resp.cqe_comp_caps.supported_format |= + MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX; + } + } + + if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && + raw_support) { + if (MLX5_CAP_QOS(mdev, packet_pacing) && + MLX5_CAP_GEN(mdev, qos)) { + resp.packet_pacing_caps.qp_rate_limit_max = + MLX5_CAP_QOS(mdev, packet_pacing_max_rate); + resp.packet_pacing_caps.qp_rate_limit_min = + MLX5_CAP_QOS(mdev, packet_pacing_min_rate); + resp.packet_pacing_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) && + MLX5_CAP_QOS(mdev, packet_pacing_typical_size)) + resp.packet_pacing_caps.cap_flags |= + MLX5_IB_PP_SUPPORT_BURST; + } + resp.response_length += sizeof(resp.packet_pacing_caps); + } + + if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, + uhw->outlen)) { + if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_IB_ALLOW_MPW; + + if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes |= + MLX5_IB_SUPPORT_EMPW; + + resp.response_length += + sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); + } + + if (field_avail(typeof(resp), flags, uhw->outlen)) { + resp.response_length += sizeof(resp.flags); + + if (MLX5_CAP_GEN(mdev, cqe_compression_128)) + resp.flags |= + MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP; + + if (MLX5_CAP_GEN(mdev, cqe_128_always)) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD; + } + + if (field_avail(typeof(resp), sw_parsing_caps, + uhw->outlen)) { + resp.response_length += sizeof(resp.sw_parsing_caps); + if (MLX5_CAP_ETH(mdev, swp)) { + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING; + + if (MLX5_CAP_ETH(mdev, swp_csum)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_CSUM; + + if (MLX5_CAP_ETH(mdev, swp_lso)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_LSO; + + if (resp.sw_parsing_caps.sw_parsing_offloads) + resp.sw_parsing_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + + if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && + raw_support) { + resp.response_length += sizeof(resp.striding_rq_caps); + if (MLX5_CAP_GEN(mdev, striding_rq)) { + resp.striding_rq_caps.min_single_stride_log_num_of_bytes = + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES; + resp.striding_rq_caps.max_single_stride_log_num_of_bytes = + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES; + resp.striding_rq_caps.min_single_wqe_log_num_of_strides = + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + resp.striding_rq_caps.max_single_wqe_log_num_of_strides = + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES; + resp.striding_rq_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + + if (field_avail(typeof(resp), tunnel_offloads_caps, + uhw->outlen)) { + resp.response_length += sizeof(resp.tunnel_offloads_caps); + if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_VXLAN; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_GENEVE; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_GRE; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; + } + + if (uhw->outlen) { + err = ib_copy_to_udata(uhw, &resp, resp.response_length); + + if (err) + return err; + } + + return 0; +} + +enum mlx5_ib_width { + MLX5_IB_WIDTH_1X = 1 << 0, + MLX5_IB_WIDTH_2X = 1 << 1, + MLX5_IB_WIDTH_4X = 1 << 2, + MLX5_IB_WIDTH_8X = 1 << 3, + MLX5_IB_WIDTH_12X = 1 << 4 +}; + +static void translate_active_width(struct ib_device *ibdev, u8 active_width, + u8 *ib_width) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + if (active_width & MLX5_IB_WIDTH_1X) + *ib_width = IB_WIDTH_1X; + else if (active_width & MLX5_IB_WIDTH_4X) + *ib_width = IB_WIDTH_4X; + else if (active_width & MLX5_IB_WIDTH_8X) + *ib_width = IB_WIDTH_8X; + else if (active_width & MLX5_IB_WIDTH_12X) + *ib_width = IB_WIDTH_12X; + else { + mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n", + (int)active_width); + *ib_width = IB_WIDTH_4X; + } + + return; +} + +static int mlx5_mtu_to_ib_mtu(int mtu) +{ + switch (mtu) { + case 256: return 1; + case 512: return 2; + case 1024: return 3; + case 2048: return 4; + case 4096: return 5; + default: + pr_warn("invalid mtu\n"); + return -1; + } +} + +enum ib_max_vl_num { + __IB_MAX_VL_0 = 1, + __IB_MAX_VL_0_1 = 2, + __IB_MAX_VL_0_3 = 3, + __IB_MAX_VL_0_7 = 4, + __IB_MAX_VL_0_14 = 5, +}; + +enum mlx5_vl_hw_cap { + MLX5_VL_HW_0 = 1, + MLX5_VL_HW_0_1 = 2, + MLX5_VL_HW_0_2 = 3, + MLX5_VL_HW_0_3 = 4, + MLX5_VL_HW_0_4 = 5, + MLX5_VL_HW_0_5 = 6, + MLX5_VL_HW_0_6 = 7, + MLX5_VL_HW_0_7 = 8, + MLX5_VL_HW_0_14 = 15 +}; + +static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, + u8 *max_vl_num) +{ + switch (vl_hw_cap) { + case MLX5_VL_HW_0: + *max_vl_num = __IB_MAX_VL_0; + break; + case MLX5_VL_HW_0_1: + *max_vl_num = __IB_MAX_VL_0_1; + break; + case MLX5_VL_HW_0_3: + *max_vl_num = __IB_MAX_VL_0_3; + break; + case MLX5_VL_HW_0_7: + *max_vl_num = __IB_MAX_VL_0_7; + break; + case MLX5_VL_HW_0_14: + *max_vl_num = __IB_MAX_VL_0_14; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *rep; + u16 max_mtu; + u16 oper_mtu; + int err; + u8 ib_link_width_oper; + u8 vl_hw_cap; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + /* props being zeroed by the caller, avoid zeroing it here */ + + err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); + if (err) + goto out; + + props->lid = rep->lid; + props->lmc = rep->lmc; + props->sm_lid = rep->sm_lid; + props->sm_sl = rep->sm_sl; + props->state = rep->vport_state; + props->phys_state = rep->port_physical_state; + props->port_cap_flags = rep->cap_mask1; + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); + props->bad_pkey_cntr = rep->pkey_violation_counter; + props->qkey_viol_cntr = rep->qkey_violation_counter; + props->subnet_timeout = rep->subnet_timeout; + props->init_type_reply = rep->init_type_reply; + + err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port); + if (err) + goto out; + + translate_active_width(ibdev, ib_link_width_oper, &props->active_width); + + err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port); + if (err) + goto out; + + mlx5_query_port_max_mtu(mdev, &max_mtu, port); + + props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu); + + mlx5_query_port_oper_mtu(mdev, &oper_mtu, port); + + props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu); + + err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port); + if (err) + goto out; + + err = translate_max_vl_num(ibdev, vl_hw_cap, + &props->max_vl_num); +out: + kfree(rep); + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + unsigned int count; + int ret; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + ret = mlx5_query_mad_ifc_port(ibdev, port, props); + break; + + case MLX5_VPORT_ACCESS_METHOD_HCA: + ret = mlx5_query_hca_port(ibdev, port, props); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + ret = mlx5_query_port_roce(ibdev, port, props); + break; + + default: + ret = -EINVAL; + } + + if (!ret && props) { + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL); + if (!mdev) { + /* If the port isn't affiliated yet query the master. + * The master and slave will have the same values. + */ + mdev = dev->mdev; + port = 1; + put_mdev = false; + } + count = mlx5_core_reserved_gids_count(mdev); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + props->gid_tbl_len -= count; + } + return ret; +} + +static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + int ret; + + /* Only link layer == ethernet is valid for representors */ + ret = mlx5_query_port_roce(ibdev, port, props); + if (ret || !props) + return ret; + + /* We don't support GIDS */ + props->gid_tbl_len = 0; + + return ret; +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_gids(ibdev, port, index, gid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid); + + default: + return -EINVAL; + } + +} + +static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port, + u16 index, u16 *pkey) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + u8 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num); + if (!mdev) { + /* The port isn't affiliated yet, get the PKey from the master + * port. For RoCE the PKey tables will be the same. + */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + } + + err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0, + index, pkey); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey); + default: + return -EINVAL; + } +} + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + + return err; +} + +static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask, + u32 value) +{ + struct mlx5_hca_vport_context ctx = {}; + struct mlx5_core_dev *mdev; + u8 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return -ENODEV; + + err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx); + if (err) + goto out; + + if (~ctx.cap_mask1_perm & mask) { + mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n", + mask, ctx.cap_mask1_perm); + err = -EINVAL; + goto out; + } + + ctx.cap_mask1 = value; + ctx.cap_mask1_perm = mask; + err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num, + 0, &ctx); + +out: + mlx5_ib_put_native_port_mdev(dev, port_num); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + u32 change_mask; + u32 value; + bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) == + IB_LINK_LAYER_INFINIBAND); + + /* CM layer calls ib_modify_port() regardless of the link layer. For + * Ethernet ports, qkey violation and Port capabilities are meaningless. + */ + if (!is_ib) + return 0; + + if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) { + change_mask = props->clr_port_cap_mask | props->set_port_cap_mask; + value = ~props->clr_port_cap_mask | props->set_port_cap_mask; + return set_port_caps_atomic(dev, port, change_mask, value); + } + + mutex_lock(&dev->cap_mask_mutex); + + err = ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) +{ + mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n", + caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n"); +} + +static u16 calc_dynamic_bfregs(int uars_per_sys_page) +{ + /* Large page with non 4k uar support might limit the dynamic size */ + if (uars_per_sys_page == 1 && PAGE_SIZE > 4096) + return MLX5_MIN_DYN_BFREGS; + + return MLX5_MAX_DYN_BFREGS; +} + +static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, + struct mlx5_ib_alloc_ucontext_req_v2 *req, + struct mlx5_bfreg_info *bfregi) +{ + int uars_per_sys_page; + int bfregs_per_sys_page; + int ref_bfregs = req->total_num_bfregs; + + if (req->total_num_bfregs == 0) + return -EINVAL; + + BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE); + BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE); + + if (req->total_num_bfregs > MLX5_MAX_BFREGS) + return -ENOMEM; + + uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); + bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; + /* This holds the required static allocation asked by the user */ + req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); + if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) + return -EINVAL; + + bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; + bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page); + bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs; + bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page; + + mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n", + MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", + lib_uar_4k ? "yes" : "no", ref_bfregs, + req->total_num_bfregs, bfregi->total_num_bfregs, + bfregi->num_sys_pages); + + return 0; +} + +static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int err; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_static_sys_pages; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); + if (err) + goto error; + + mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); + } + + for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++) + bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX; + + return 0; + +error: + for (--i; i >= 0; i--) + if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i])) + mlx5_ib_warn(dev, "failed to free uar %d\n", i); + + return err; +} + +static void deallocate_uars(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_sys_pages; i++) + if (i < bfregi->num_static_sys_pages || + bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) + mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); +} + +static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) +{ + int err; + + if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + return 0; + + err = mlx5_core_alloc_transport_domain(dev->mdev, tdn); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + return err; + + mutex_lock(&dev->lb_mutex); + dev->user_td++; + + if (dev->user_td == 2) + err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + + mutex_unlock(&dev->lb_mutex); + return err; +} + +static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) +{ + if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + return; + + mlx5_core_dealloc_transport_domain(dev->mdev, tdn); + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + return; + + mutex_lock(&dev->lb_mutex); + dev->user_td--; + + if (dev->user_td < 2) + mlx5_nic_vport_update_local_lb(dev->mdev, false); + + mutex_unlock(&dev->lb_mutex); +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req = {}; + struct mlx5_ib_alloc_ucontext_resp resp = {}; + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_ucontext *context; + struct mlx5_bfreg_info *bfregi; + int ver; + int err; + size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, + max_cqe_version); + u32 dump_fill_mkey; + bool lib_uar_4k; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (udata->inlen >= min_req_v2) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return ERR_PTR(err); + + if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX) + return ERR_PTR(-EOPNOTSUPP); + + if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + req.total_num_bfregs = ALIGN(req.total_num_bfregs, + MLX5_NON_FP_BFREGS_PER_UAR); + if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) + return ERR_PTR(-EINVAL); + + resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); + if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) + resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); + resp.cache_line_size = cache_line_size(); + resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); + resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); + resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + resp.cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); + resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; + resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1; + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) { + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS)) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA; + if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN; + /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */ + } + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; + bfregi = &context->bfregi; + + /* updates req->total_num_bfregs */ + err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); + if (err) + goto out_ctx; + + mutex_init(&bfregi->lock); + bfregi->lib_uar_4k = lib_uar_4k; + bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count), + GFP_KERNEL); + if (!bfregi->count) { + err = -ENOMEM; + goto out_ctx; + } + + bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, + sizeof(*bfregi->sys_pages), + GFP_KERNEL); + if (!bfregi->sys_pages) { + err = -ENOMEM; + goto out_count; + } + + err = allocate_uars(dev, context); + if (err) + goto out_sys_pages; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; +#endif + + err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); + if (err) + goto out_uars; + + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { + /* Block DEVX on Infiniband as of SELinux */ + if (mlx5_ib_port_link_layer(ibdev, 1) != IB_LINK_LAYER_ETHERNET) { + err = -EPERM; + goto out_td; + } + + err = mlx5_ib_devx_create(dev, context); + if (err) + goto out_td; + } + + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { + err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey); + if (err) + goto out_mdev; + } + + INIT_LIST_HEAD(&context->vma_private_list); + mutex_init(&context->vma_private_list_mutex); + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_bfregs = req.total_num_bfregs; + resp.num_ports = dev->num_ports; + + if (field_avail(typeof(resp), cqe_version, udata->outlen)) + resp.response_length += sizeof(resp.cqe_version); + + if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { + resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | + MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; + resp.response_length += sizeof(resp.cmds_supp_uhw); + } + + if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) { + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { + mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline); + resp.eth_min_inline++; + } + resp.response_length += sizeof(resp.eth_min_inline); + } + + if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) { + if (mdev->clock_info) + resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); + resp.response_length += sizeof(resp.clock_info_versions); + } + + /* + * We don't want to expose information from the PCI bar that is located + * after 4096 bytes, so if the arch only supports larger pages, let's + * pretend we don't support reading the HCA's core clock. This is also + * forced by mmap function. + */ + if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + if (PAGE_SIZE <= 4096) { + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; + } + resp.response_length += sizeof(resp.hca_core_clock_offset); + } + + if (field_avail(typeof(resp), log_uar_size, udata->outlen)) + resp.response_length += sizeof(resp.log_uar_size); + + if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) + resp.response_length += sizeof(resp.num_uars_per_page); + + if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) { + resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; + resp.response_length += sizeof(resp.num_dyn_bfregs); + } + + if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) { + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { + resp.dump_fill_mkey = dump_fill_mkey; + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY; + } + resp.response_length += sizeof(resp.dump_fill_mkey); + } + + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto out_mdev; + + bfregi->ver = ver; + bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; + context->cqe_version = resp.cqe_version; + context->lib_caps = req.lib_caps; + print_lib_caps(dev, context->lib_caps); + + if (mlx5_lag_is_active(dev->mdev)) { + u8 port = mlx5_core_native_port_num(dev->mdev); + + atomic_set(&context->tx_port_affinity, + atomic_add_return( + 1, &dev->roce[port].tx_port_affinity)); + } + + return &context->ibucontext; + +out_mdev: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) + mlx5_ib_devx_destroy(dev, context); +out_td: + mlx5_ib_dealloc_transport_domain(dev, context->tdn); + +out_uars: + deallocate_uars(dev, context); + +out_sys_pages: + kfree(bfregi->sys_pages); + +out_count: + kfree(bfregi->count); + +out_ctx: + kfree(context); + + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_bfreg_info *bfregi; + + if (context->devx_uid) + mlx5_ib_devx_destroy(dev, context); + + bfregi = &context->bfregi; + mlx5_ib_dealloc_transport_domain(dev, context->tdn); + + deallocate_uars(dev, context); + kfree(bfregi->sys_pages); + kfree(bfregi->count); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, + int uar_idx) +{ + int fw_uars_per_page; + + fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; + + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +/* Index resides in an extra byte to enable larger values than 255 */ +static int get_extended_index(unsigned long offset) +{ + return get_arg(offset) | ((offset >> 16) & 0xff) << 8; +} + +static void mlx5_ib_vma_open(struct vm_area_struct *area) +{ + /* vma_open is called when a new VMA is created on top of our VMA. This + * is done through either mremap flow or split_vma (usually due to + * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, + * as this VMA is strongly hardware related. Therefore we set the + * vm_ops of the newly created/cloned VMA to NULL, to prevent it from + * calling us again and trying to do incorrect actions. We assume that + * the original VMA size is exactly a single page, and therefore all + * "splitting" operation will not happen to it. + */ + area->vm_ops = NULL; +} + +static void mlx5_ib_vma_close(struct vm_area_struct *area) +{ + struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; + + /* It's guaranteed that all VMAs opened on a FD are closed before the + * file itself is closed, therefore no sync is needed with the regular + * closing flow. (e.g. mlx5 ib_dealloc_ucontext) + * However need a sync with accessing the vma as part of + * mlx5_ib_disassociate_ucontext. + * The close operation is usually called under mm->mmap_sem except when + * process is exiting. + * The exiting case is handled explicitly as part of + * mlx5_ib_disassociate_ucontext. + */ + mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data; + + /* setting the vma context pointer to null in the mlx5_ib driver's + * private data, to protect a race condition in + * mlx5_ib_disassociate_ucontext(). + */ + mlx5_ib_vma_priv_data->vma = NULL; + mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex); + list_del(&mlx5_ib_vma_priv_data->list); + mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex); + kfree(mlx5_ib_vma_priv_data); +} + +static const struct vm_operations_struct mlx5_ib_vm_ops = { + .open = mlx5_ib_vma_open, + .close = mlx5_ib_vma_close +}; + +static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, + struct mlx5_ib_ucontext *ctx) +{ + struct mlx5_ib_vma_private_data *vma_prv; + struct list_head *vma_head = &ctx->vma_private_list; + + vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL); + if (!vma_prv) + return -ENOMEM; + + vma_prv->vma = vma; + vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex; + vma->vm_private_data = vma_prv; + vma->vm_ops = &mlx5_ib_vm_ops; + + mutex_lock(&ctx->vma_private_list_mutex); + list_add(&vma_prv->list, vma_head); + mutex_unlock(&ctx->vma_private_list_mutex); + + return 0; +} + +static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + struct vm_area_struct *vma; + struct mlx5_ib_vma_private_data *vma_private, *n; + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + + mutex_lock(&context->vma_private_list_mutex); + list_for_each_entry_safe(vma_private, n, &context->vma_private_list, + list) { + vma = vma_private->vma; + zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); + /* context going to be destroyed, should + * not access ops any more. + */ + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + vma->vm_ops = NULL; + list_del(&vma_private->list); + kfree(vma_private); + } + mutex_unlock(&context->vma_private_list_mutex); +} + +static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) +{ + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + return "WC"; + case MLX5_IB_MMAP_REGULAR_PAGE: + return "best effort WC"; + case MLX5_IB_MMAP_NC_PAGE: + return "NC"; + case MLX5_IB_MMAP_DEVICE_MEM: + return "Device Memory"; + default: + return NULL; + } +} + +static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + phys_addr_t pfn; + int err; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) + return -EOPNOTSUPP; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; + + if (!dev->mdev->clock_info_page) + return -EOPNOTSUPP; + + pfn = page_to_pfn(dev->mdev->clock_info_page); + err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, + vma->vm_page_prot); + if (err) + return err; + + return mlx5_ib_set_vma_data(vma, context); +} + +static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi = &context->bfregi; + int err; + unsigned long idx; + phys_addr_t pfn; + pgprot_t prot; + u32 bfreg_dyn_idx = 0; + u32 uar_index; + int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC); + int max_valid_idx = dyn_uar ? bfregi->num_sys_pages : + bfregi->num_static_sys_pages; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (dyn_uar) + idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages; + else + idx = get_index(vma->vm_pgoff); + + if (idx >= max_valid_idx) { + mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n", + idx, max_valid_idx); + return -EINVAL; + } + + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: +/* Some architectures don't support WC memory */ +#if defined(CONFIG_X86) + if (!pat_enabled()) + return -EPERM; +#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) + return -EPERM; +#endif + /* fall through */ + case MLX5_IB_MMAP_REGULAR_PAGE: + /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ + prot = pgprot_writecombine(vma->vm_page_prot); + break; + case MLX5_IB_MMAP_NC_PAGE: + prot = pgprot_noncached(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + if (dyn_uar) { + int uars_per_page; + + uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); + bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR); + if (bfreg_dyn_idx >= bfregi->total_num_bfregs) { + mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n", + bfreg_dyn_idx, bfregi->total_num_bfregs); + return -EINVAL; + } + + mutex_lock(&bfregi->lock); + /* Fail if uar already allocated, first bfreg index of each + * page holds its count. + */ + if (bfregi->count[bfreg_dyn_idx]) { + mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx); + mutex_unlock(&bfregi->lock); + return -EINVAL; + } + + bfregi->count[bfreg_dyn_idx]++; + mutex_unlock(&bfregi->lock); + + err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); + if (err) { + mlx5_ib_warn(dev, "UAR alloc failed\n"); + goto free_bfreg; + } + } else { + uar_index = bfregi->sys_pages[idx]; + } + + pfn = uar_index2pfn(dev, uar_index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); + + vma->vm_page_prot = prot; + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + mlx5_ib_err(dev, + "io_remap_pfn_range failed with error=%d, mmap_cmd=%s\n", + err, mmap_cmd2str(cmd)); + err = -EAGAIN; + goto err; + } + + err = mlx5_ib_set_vma_data(vma, context); + if (err) + goto err; + + if (dyn_uar) + bfregi->sys_pages[idx] = uar_index; + return 0; + +err: + if (!dyn_uar) + return err; + + mlx5_cmd_free_uar(dev->mdev, idx); + +free_bfreg: + mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); + + return err; +} + +static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *mctx = to_mucontext(context); + struct mlx5_ib_dev *dev = to_mdev(context->device); + u16 page_idx = get_extended_index(vma->vm_pgoff); + size_t map_size = vma->vm_end - vma->vm_start; + u32 npages = map_size >> PAGE_SHIFT; + phys_addr_t pfn; + pgprot_t prot; + + if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) != + page_idx + npages) + return -EINVAL; + + pfn = ((pci_resource_start(dev->mdev->pdev, 0) + + MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >> + PAGE_SHIFT) + + page_idx; + prot = pgprot_writecombine(vma->vm_page_prot); + vma->vm_page_prot = prot; + + if (io_remap_pfn_range(vma, vma->vm_start, pfn, map_size, + vma->vm_page_prot)) + return -EAGAIN; + + return mlx5_ib_set_vma_data(vma, mctx); +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + unsigned long command; + phys_addr_t pfn; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_NC_PAGE: + case MLX5_IB_MMAP_REGULAR_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: + return uar_mmap(dev, command, vma, context); + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + case MLX5_IB_MMAP_CORE_CLOCK: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; + + /* Don't expose to user-space information it shouldn't have */ + if (PAGE_SIZE > 4096) + return -EOPNOTSUPP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + pfn = (dev->mdev->iseg_base + + offsetof(struct mlx5_init_seg, internal_timer_h)) >> + PAGE_SHIFT; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + break; + case MLX5_IB_MMAP_CLOCK_INFO: + return mlx5_ib_mmap_clock_info_page(dev, vma, context); + + case MLX5_IB_MMAP_DEVICE_MEM: + return dm_mmap(ibcontext, vma); + + default: + return -EINVAL; + } + + return 0; +} + +struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + u64 act_size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE); + struct mlx5_memic *memic = &to_mdev(ibdev)->memic; + phys_addr_t memic_addr; + struct mlx5_ib_dm *dm; + u64 start_offset; + u32 page_idx; + int err; + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + mlx5_ib_dbg(to_mdev(ibdev), "alloc_memic req: user_length=0x%llx act_length=0x%llx log_alignment=%d\n", + attr->length, act_size, attr->alignment); + + err = mlx5_cmd_alloc_memic(memic, &memic_addr, + act_size, attr->alignment); + if (err) + goto err_free; + + start_offset = memic_addr & ~PAGE_MASK; + page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) - + MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> + PAGE_SHIFT; + + err = uverbs_copy_to(attrs, + MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); + if (err) + goto err_dealloc; + + err = uverbs_copy_to(attrs, + MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + &page_idx, sizeof(page_idx)); + if (err) + goto err_dealloc; + + bitmap_set(to_mucontext(context)->dm_pages, page_idx, + DIV_ROUND_UP(act_size, PAGE_SIZE)); + + dm->dev_addr = memic_addr; + + return &dm->ibdm; + +err_dealloc: + mlx5_cmd_dealloc_memic(memic, memic_addr, + act_size); +err_free: + kfree(dm); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_dm(struct ib_dm *ibdm) +{ + struct mlx5_memic *memic = &to_mdev(ibdm->device)->memic; + struct mlx5_ib_dm *dm = to_mdm(ibdm); + u64 act_size = roundup(dm->ibdm.length, MLX5_MEMIC_BASE_SIZE); + u32 page_idx; + int ret; + + ret = mlx5_cmd_dealloc_memic(memic, dm->dev_addr, act_size); + if (ret) + return ret; + + page_idx = (dm->dev_addr - pci_resource_start(memic->dev->pdev, 0) - + MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> + PAGE_SHIFT; + bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages, + page_idx, + DIV_ROUND_UP(act_size, PAGE_SIZE)); + + kfree(dm); + + return 0; +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +enum { + MATCH_CRITERIA_ENABLE_OUTER_BIT, + MATCH_CRITERIA_ENABLE_MISC_BIT, + MATCH_CRITERIA_ENABLE_INNER_BIT, + MATCH_CRITERIA_ENABLE_MISC2_BIT +}; + +#define HEADER_IS_ZERO(match_criteria, headers) \ + !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ + 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ + +static u8 get_match_criteria_enable(u32 *match_criteria) +{ + u8 match_criteria_enable; + + match_criteria_enable = + (!HEADER_IS_ZERO(match_criteria, outer_headers)) << + MATCH_CRITERIA_ENABLE_OUTER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << + MATCH_CRITERIA_ENABLE_MISC_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, inner_headers)) << + MATCH_CRITERIA_ENABLE_INNER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << + MATCH_CRITERIA_ENABLE_MISC2_BIT; + + return match_criteria_enable; +} + +static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + u8 entry_mask; + u8 entry_val; + int err = 0; + + if (!mask) + goto out; + + entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c, + ip_protocol); + entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v, + ip_protocol); + if (!entry_mask) { + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); + goto out; + } + /* Don't override existing ip protocol */ + if (mask != entry_mask || val != entry_val) + err = -EINVAL; +out: + return err; +} + +static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val, + bool inner) +{ + if (inner) { + MLX5_SET(fte_match_set_misc, + misc_c, inner_ipv6_flow_label, mask); + MLX5_SET(fte_match_set_misc, + misc_v, inner_ipv6_flow_label, val); + } else { + MLX5_SET(fte_match_set_misc, + misc_c, outer_ipv6_flow_label, mask); + MLX5_SET(fte_match_set_misc, + misc_v, outer_ipv6_flow_label, val); + } +} + +static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); +} + +static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) +{ + if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL)) + return -EOPNOTSUPP; + + return 0; +} + +#define LAST_ETH_FIELD vlan_tag +#define LAST_IB_FIELD sl +#define LAST_IPV4_FIELD tos +#define LAST_IPV6_FIELD traffic_class +#define LAST_TCP_UDP_FIELD src_port +#define LAST_TUNNEL_FIELD tunnel_id +#define LAST_FLOW_TAG_FIELD tag_id +#define LAST_DROP_FIELD size +#define LAST_COUNTERS_FIELD counters + +/* Field is the last supported field */ +#define FIELDS_NOT_SUPPORTED(filter, field)\ + memchr_inv((void *)&filter.field +\ + sizeof(filter.field), 0,\ + sizeof(filter) -\ + offsetof(typeof(filter), field) -\ + sizeof(filter.field)) + +static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_act *action) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(ib_spec->action.act); + + switch (maction->ib_action.type) { + case IB_FLOW_ACTION_ESP: + /* Currently only AES_GCM keymat is supported by the driver */ + action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; + action->action |= flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS ? + MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : + MLX5_FLOW_CONTEXT_ACTION_DECRYPT; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, + u32 *match_v, const union ib_flow_spec *ib_spec, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_act *action, u32 prev_type) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters); + void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters_2); + void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters_2); + void *headers_c; + void *headers_v; + int match_ipv; + int ret; + + if (ib_spec->type & IB_FLOW_SPEC_INNER) { + headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + inner_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version); + } else { + headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + } + + switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { + case IB_FLOW_SPEC_ETH: + if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) + return -EOPNOTSUPP; + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dmac_47_16), + ib_spec->eth.mask.dst_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dmac_47_16), + ib_spec->eth.val.dst_mac); + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + smac_47_16), + ib_spec->eth.mask.src_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + smac_47_16), + ib_spec->eth.val.src_mac); + + if (ib_spec->eth.mask.vlan_tag) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + cvlan_tag, 1); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_vid, ntohs(ib_spec->eth.val.vlan_tag)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_cfi, + ntohs(ib_spec->eth.mask.vlan_tag) >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_cfi, + ntohs(ib_spec->eth.val.vlan_tag) >> 12); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_prio, + ntohs(ib_spec->eth.mask.vlan_tag) >> 13); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_prio, + ntohs(ib_spec->eth.val.vlan_tag) >> 13); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, ntohs(ib_spec->eth.mask.ether_type)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ntohs(ib_spec->eth.val.ether_type)); + break; + case IB_FLOW_SPEC_IPV4: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) + return -EOPNOTSUPP; + + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, MLX5_FS_IPV4_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IP); + } + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.src_ip, + sizeof(ib_spec->ipv4.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.src_ip, + sizeof(ib_spec->ipv4.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.dst_ip, + sizeof(ib_spec->ipv4.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.dst_ip, + sizeof(ib_spec->ipv4.val.dst_ip)); + + set_tos(headers_c, headers_v, + ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); + + if (set_proto(headers_c, headers_v, + ib_spec->ipv4.mask.proto, + ib_spec->ipv4.val.proto)) + return -EINVAL; + break; + case IB_FLOW_SPEC_IPV6: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) + return -EOPNOTSUPP; + + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, MLX5_FS_IPV6_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IPV6); + } + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.src_ip, + sizeof(ib_spec->ipv6.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.src_ip, + sizeof(ib_spec->ipv6.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.dst_ip, + sizeof(ib_spec->ipv6.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.dst_ip, + sizeof(ib_spec->ipv6.val.dst_ip)); + + set_tos(headers_c, headers_v, + ib_spec->ipv6.mask.traffic_class, + ib_spec->ipv6.val.traffic_class); + + if (set_proto(headers_c, headers_v, + ib_spec->ipv6.mask.next_hdr, + ib_spec->ipv6.val.next_hdr)) + return -EINVAL; + + set_flow_label(misc_params_c, misc_params_v, + ntohl(ib_spec->ipv6.mask.flow_label), + ntohl(ib_spec->ipv6.val.flow_label), + ib_spec->type & IB_FLOW_SPEC_INNER); + break; + case IB_FLOW_SPEC_ESP: + if (ib_spec->esp.mask.seq) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, + ntohl(ib_spec->esp.mask.spi)); + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, + ntohl(ib_spec->esp.val.spi)); + break; + case IB_FLOW_SPEC_TCP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -EOPNOTSUPP; + + if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_UDP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -EOPNOTSUPP; + + if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_GRE: + if (ib_spec->gre.mask.c_ks_res0_ver) + return -EOPNOTSUPP; + + if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_GRE); + + MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol, + ntohs(ib_spec->gre.mask.protocol)); + MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol, + ntohs(ib_spec->gre.val.protocol)); + + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c, + gre_key_h), + &ib_spec->gre.mask.key, + sizeof(ib_spec->gre.mask.key)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v, + gre_key_h), + &ib_spec->gre.val.key, + sizeof(ib_spec->gre.val.key)); + break; + case IB_FLOW_SPEC_MPLS: + switch (prev_type) { + case IB_FLOW_SPEC_UDP: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_udp), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + break; + case IB_FLOW_SPEC_GRE: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_gre), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_gre), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_gre), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + break; + default: + if (ib_spec->type & IB_FLOW_SPEC_INNER) { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_first_mpls), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + inner_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + inner_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } else { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } + } + break; + case IB_FLOW_SPEC_VXLAN_TUNNEL: + if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, + LAST_TUNNEL_FIELD)) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni, + ntohl(ib_spec->tunnel.mask.tunnel_id)); + MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni, + ntohl(ib_spec->tunnel.val.tunnel_id)); + break; + case IB_FLOW_SPEC_ACTION_TAG: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag, + LAST_FLOW_TAG_FIELD)) + return -EOPNOTSUPP; + if (ib_spec->flow_tag.tag_id >= BIT(24)) + return -EINVAL; + + action->flow_tag = ib_spec->flow_tag.tag_id; + action->has_flow_tag = true; + break; + case IB_FLOW_SPEC_ACTION_DROP: + if (FIELDS_NOT_SUPPORTED(ib_spec->drop, + LAST_DROP_FIELD)) + return -EOPNOTSUPP; + action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + break; + case IB_FLOW_SPEC_ACTION_HANDLE: + ret = parse_flow_flow_action(ib_spec, flow_attr, action); + if (ret) + return ret; + break; + case IB_FLOW_SPEC_ACTION_COUNT: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count, + LAST_COUNTERS_FIELD)) + return -EOPNOTSUPP; + + /* for now support only one counters spec per flow */ + if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) + return -EINVAL; + + action->counters = ib_spec->flow_count.counters; + action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* If a flow could catch both multicast and unicast packets, + * it won't fall into the multicast flow steering table and this rule + * could steal other multicast packets. + */ +static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) +{ + union ib_flow_spec *flow_spec; + + if (ib_attr->type != IB_FLOW_ATTR_NORMAL || + ib_attr->num_of_specs < 1) + return false; + + flow_spec = (union ib_flow_spec *)(ib_attr + 1); + if (flow_spec->type == IB_FLOW_SPEC_IPV4) { + struct ib_flow_spec_ipv4 *ipv4_spec; + + ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec; + if (ipv4_is_multicast(ipv4_spec->val.dst_ip)) + return true; + + return false; + } + + if (flow_spec->type == IB_FLOW_SPEC_ETH) { + struct ib_flow_spec_eth *eth_spec; + + eth_spec = (struct ib_flow_spec_eth *)flow_spec; + return is_multicast_ether_addr(eth_spec->mask.dst_mac) && + is_multicast_ether_addr(eth_spec->val.dst_mac); + } + + return false; +} + +enum valid_spec { + VALID_SPEC_INVALID, + VALID_SPEC_VALID, + VALID_SPEC_NA, +}; + +static enum valid_spec +is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev, + const struct mlx5_flow_spec *spec, + const struct mlx5_flow_act *flow_act, + bool egress) +{ + const u32 *match_c = spec->match_criteria; + bool is_crypto = + (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT)); + bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c); + bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* + * Currently only crypto is supported in egress, when regular egress + * rules would be supported, always return VALID_SPEC_NA. + */ + if (!is_crypto) + return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA; + + return is_crypto && is_ipsec && + (!egress || (!is_drop && !flow_act->has_flow_tag)) ? + VALID_SPEC_VALID : VALID_SPEC_INVALID; +} + +static bool is_valid_spec(struct mlx5_core_dev *mdev, + const struct mlx5_flow_spec *spec, + const struct mlx5_flow_act *flow_act, + bool egress) +{ + /* We curretly only support ipsec egress flow */ + return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID; +} + +static bool is_valid_ethertype(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr, + bool check_inner) +{ + union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); + int match_ipv = check_inner ? + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version) : + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0; + bool ipv4_spec_valid, ipv6_spec_valid; + unsigned int ip_spec_type = 0; + bool has_ethertype = false; + unsigned int spec_index; + bool mask_valid = true; + u16 eth_type = 0; + bool type_valid; + + /* Validate that ethertype is correct */ + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) && + ib_spec->eth.mask.ether_type) { + mask_valid = (ib_spec->eth.mask.ether_type == + htons(0xffff)); + has_ethertype = true; + eth_type = ntohs(ib_spec->eth.val.ether_type); + } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) || + (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) { + ip_spec_type = ib_spec->type; + } + ib_spec = (void *)ib_spec + ib_spec->size; + } + + type_valid = (!has_ethertype) || (!ip_spec_type); + if (!type_valid && mask_valid) { + ipv4_spec_valid = (eth_type == ETH_P_IP) && + (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit)); + ipv6_spec_valid = (eth_type == ETH_P_IPV6) && + (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit)); + + type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) || + (((eth_type == ETH_P_MPLS_UC) || + (eth_type == ETH_P_MPLS_MC)) && match_ipv); + } + + return type_valid; +} + +static bool is_valid_attr(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr) +{ + return is_valid_ethertype(mdev, flow_attr, false) && + is_valid_ethertype(mdev, flow_attr, true); +} + +static void put_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *prio, bool ft_added) +{ + prio->refcount -= !!ft_added; + if (!prio->refcount) { + mlx5_destroy_flow_table(prio->flow_table); + prio->flow_table = NULL; + } +} + +static void counters_clear_description(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + mutex_lock(&mcounters->mcntrs_mutex); + kfree(mcounters->counters_data); + mcounters->counters_data = NULL; + mcounters->cntrs_max_index = 0; + mutex_unlock(&mcounters->mcntrs_mutex); +} + +static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) +{ + struct mlx5_ib_flow_handler *handler = container_of(flow_id, + struct mlx5_ib_flow_handler, + ibflow); + struct mlx5_ib_flow_handler *iter, *tmp; + struct mlx5_ib_dev *dev = handler->dev; + + mutex_lock(&dev->flow_db->lock); + + list_for_each_entry_safe(iter, tmp, &handler->list, list) { + mlx5_del_flow_rules(iter->rule); + put_flow_table(dev, iter->prio, true); + list_del(&iter->list); + kfree(iter); + } + + mlx5_del_flow_rules(handler->rule); + put_flow_table(dev, handler->prio, true); + if (handler->ibcounters && + atomic_read(&handler->ibcounters->usecnt) == 1) + counters_clear_description(handler->ibcounters); + + mutex_unlock(&dev->flow_db->lock); + if (handler->flow_matcher) + atomic_dec(&handler->flow_matcher->usecnt); + kfree(handler); + + return 0; +} + +static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) +{ + priority *= 2; + if (!dont_trap) + priority++; + return priority; +} + +enum flow_table_type { + MLX5_IB_FT_RX, + MLX5_IB_FT_TX +}; + +#define MLX5_FS_MAX_TYPES 6 +#define MLX5_FS_MAX_ENTRIES BIT(16) + +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, + struct mlx5_ib_flow_prio *prio, + int priority, + int num_entries, int num_groups) +{ + struct mlx5_flow_table *ft; + + ft = mlx5_create_auto_grouped_flow_table(ns, priority, + num_entries, + num_groups, + 0, 0); + if (IS_ERR(ft)) + return ERR_CAST(ft); + + prio->flow_table = ft; + prio->refcount = 0; + return prio; +} + +static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, + struct ib_flow_attr *flow_attr, + enum flow_table_type ft_type) +{ + bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_table *ft; + int max_table_size; + int num_entries; + int num_groups; + int priority; + + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + log_max_ft_size)); + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (ft_type == MLX5_IB_FT_TX) + priority = 0; + else if (flow_is_multicast_only(flow_attr) && + !dont_trap) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = ib_prio_to_core_prio(flow_attr->priority, + dont_trap); + ns = mlx5_get_flow_namespace(dev->mdev, + ft_type == MLX5_IB_FT_TX ? + MLX5_FLOW_NAMESPACE_EGRESS : + MLX5_FLOW_NAMESPACE_BYPASS); + num_entries = MLX5_FS_MAX_ENTRIES; + num_groups = MLX5_FS_MAX_TYPES; + prio = &dev->flow_db->prios[priority]; + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + ns = mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_LEFTOVERS); + build_leftovers_ft_param(&priority, + &num_entries, + &num_groups); + prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; + } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + if (!MLX5_CAP_FLOWTABLE(dev->mdev, + allow_sniffer_and_nic_rx_shared_tir)) + return ERR_PTR(-ENOTSUPP); + + ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ? + MLX5_FLOW_NAMESPACE_SNIFFER_RX : + MLX5_FLOW_NAMESPACE_SNIFFER_TX); + + prio = &dev->flow_db->sniffer[ft_type]; + priority = 0; + num_entries = 1; + num_groups = 1; + } + + if (!ns) + return ERR_PTR(-ENOTSUPP); + + if (num_entries > max_table_size) + return ERR_PTR(-ENOMEM); + + ft = prio->flow_table; + if (!ft) + return _get_prio(ns, prio, priority, num_entries, num_groups); + + return prio; +} + +static void set_underlay_qp(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + u32 underlay_qpn) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + spec->match_criteria, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + if (underlay_qpn && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + ft_field_support.bth_dst_qp)) { + MLX5_SET(fte_match_set_misc, + misc_params_v, bth_dst_qp, underlay_qpn); + MLX5_SET(fte_match_set_misc, + misc_params_c, bth_dst_qp, 0xffffff); + } +} + +static int read_flow_counters(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr) +{ + struct mlx5_fc *fc = read_attr->hw_cntrs_hndl; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_fc_query(dev->mdev, fc, + &read_attr->out[IB_COUNTER_PACKETS], + &read_attr->out[IB_COUNTER_BYTES]); +} + +/* flow counters currently expose two counters packets and bytes */ +#define FLOW_COUNTERS_NUM 2 +static int counters_set_description(struct ib_counters *counters, + enum mlx5_ib_counters_type counters_type, + struct mlx5_ib_flow_counters_desc *desc_data, + u32 ncounters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + u32 cntrs_max_index = 0; + int i; + + if (counters_type != MLX5_IB_COUNTERS_FLOW) + return -EINVAL; + + /* init the fields for the object */ + mcounters->type = counters_type; + mcounters->read_counters = read_flow_counters; + mcounters->counters_num = FLOW_COUNTERS_NUM; + mcounters->ncounters = ncounters; + /* each counter entry have both description and index pair */ + for (i = 0; i < ncounters; i++) { + if (desc_data[i].description > IB_COUNTER_BYTES) + return -EINVAL; + + if (cntrs_max_index <= desc_data[i].index) + cntrs_max_index = desc_data[i].index + 1; + } + + mutex_lock(&mcounters->mcntrs_mutex); + mcounters->counters_data = desc_data; + mcounters->cntrs_max_index = cntrs_max_index; + mutex_unlock(&mcounters->mcntrs_mutex); + + return 0; +} + +#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2)) +static int flow_counters_set_data(struct ib_counters *ibcounters, + struct mlx5_ib_create_flow *ucmd) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters); + struct mlx5_ib_flow_counters_data *cntrs_data = NULL; + struct mlx5_ib_flow_counters_desc *desc_data = NULL; + bool hw_hndl = false; + int ret = 0; + + if (ucmd && ucmd->ncounters_data != 0) { + cntrs_data = ucmd->data; + if (cntrs_data->ncounters > MAX_COUNTERS_NUM) + return -EINVAL; + + desc_data = kcalloc(cntrs_data->ncounters, + sizeof(*desc_data), + GFP_KERNEL); + if (!desc_data) + return -ENOMEM; + + if (copy_from_user(desc_data, + u64_to_user_ptr(cntrs_data->counters_data), + sizeof(*desc_data) * cntrs_data->ncounters)) { + ret = -EFAULT; + goto free; + } + } + + if (!mcounters->hw_cntrs_hndl) { + mcounters->hw_cntrs_hndl = mlx5_fc_create( + to_mdev(ibcounters->device)->mdev, false); + if (IS_ERR(mcounters->hw_cntrs_hndl)) { + ret = PTR_ERR(mcounters->hw_cntrs_hndl); + goto free; + } + hw_hndl = true; + } + + if (desc_data) { + /* counters already bound to at least one flow */ + if (mcounters->cntrs_max_index) { + ret = -EINVAL; + goto free_hndl; + } + + ret = counters_set_description(ibcounters, + MLX5_IB_COUNTERS_FLOW, + desc_data, + cntrs_data->ncounters); + if (ret) + goto free_hndl; + + } else if (!mcounters->cntrs_max_index) { + /* counters not bound yet, must have udata passed */ + ret = -EINVAL; + goto free_hndl; + } + + return 0; + +free_hndl: + if (hw_hndl) { + mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev, + mcounters->hw_cntrs_hndl); + mcounters->hw_cntrs_hndl = NULL; + } +free: + kfree(desc_data); + return ret; +} + +static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst, + u32 underlay_qpn, + struct mlx5_ib_create_flow *ucmd) +{ + struct mlx5_flow_table *ft = ft_prio->flow_table; + struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_destination dest_arr[2] = {}; + struct mlx5_flow_destination *rule_dst = dest_arr; + const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); + unsigned int spec_index; + u32 prev_type = 0; + int err = 0; + int dest_num = 0; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + + if (!is_valid_attr(dev->mdev, flow_attr)) + return ERR_PTR(-EINVAL); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !spec) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + err = parse_flow_attr(dev->mdev, spec->match_criteria, + spec->match_value, + ib_flow, flow_attr, &flow_act, + prev_type); + if (err < 0) + goto free; + + prev_type = ((union ib_flow_spec *)ib_flow)->type; + ib_flow += ((union ib_flow_spec *)ib_flow)->size; + } + + if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) { + memcpy(&dest_arr[0], dst, sizeof(*dst)); + dest_num++; + } + + if (!flow_is_multicast_only(flow_attr)) + set_underlay_qp(dev, spec, underlay_qpn); + + if (dev->rep) { + void *misc; + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, + dev->rep->vport); + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + } + + spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); + + if (is_egress && + !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) { + err = -EINVAL; + goto free; + } + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + err = flow_counters_set_data(flow_act.counters, ucmd); + if (err) + goto free; + + handler->ibcounters = flow_act.counters; + dest_arr[dest_num].type = + MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest_arr[dest_num].counter = + to_mcounters(flow_act.counters)->hw_cntrs_hndl; + dest_num++; + } + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { + if (!dest_num) + rule_dst = NULL; + } else { + if (is_egress) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; + else + flow_act.action |= + dest_num ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + } + + if (flow_act.has_flow_tag && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { + mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n", + flow_act.flow_tag, flow_attr->type); + err = -EINVAL; + goto free; + } + handler->rule = mlx5_add_flow_rules(ft, spec, + &flow_act, + rule_dst, dest_num); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + ft_prio->refcount++; + handler->prio = ft_prio; + handler->dev = dev; + + ft_prio->flow_table = ft; +free: + if (err && handler) { + if (handler->ibcounters && + atomic_read(&handler->ibcounters->usecnt) == 1) + counters_clear_description(handler->ibcounters); + kfree(handler); + } + kvfree(spec); + return err ? ERR_PTR(err) : handler; +} + +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); +} + +static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_dst = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); + if (!IS_ERR(handler)) { + handler_dst = create_flow_rule(dev, ft_prio, + flow_attr, dst); + if (IS_ERR(handler_dst)) { + mlx5_del_flow_rules(handler->rule); + ft_prio->refcount--; + kfree(handler); + handler = handler_dst; + } else { + list_add(&handler_dst->list, &handler->list); + } + } + + return handler; +} +enum { + LEFTOVERS_MC, + LEFTOVERS_UC, +}; + +static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_ucast = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + static struct { + struct ib_flow_attr flow_attr; + struct ib_flow_spec_eth eth_flow; + } leftovers_specs[] = { + [LEFTOVERS_MC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {0x1} } + } + }, + [LEFTOVERS_UC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {} } + } + } + }; + + handler = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_MC].flow_attr, + dst); + if (!IS_ERR(handler) && + flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { + handler_ucast = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_UC].flow_attr, + dst); + if (IS_ERR(handler_ucast)) { + mlx5_del_flow_rules(handler->rule); + ft_prio->refcount--; + kfree(handler); + handler = handler_ucast; + } else { + list_add(&handler_ucast->list, &handler->list); + } + } + + return handler; +} + +static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_rx, + struct mlx5_ib_flow_prio *ft_tx, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_rx; + struct mlx5_ib_flow_handler *handler_tx; + int err; + static const struct ib_flow_attr flow_attr = { + .num_of_specs = 0, + .size = sizeof(flow_attr) + }; + + handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); + if (IS_ERR(handler_rx)) { + err = PTR_ERR(handler_rx); + goto err; + } + + handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); + if (IS_ERR(handler_tx)) { + err = PTR_ERR(handler_tx); + goto err_tx; + } + + list_add(&handler_tx->list, &handler_rx->list); + + return handler_rx; + +err_tx: + mlx5_del_flow_rules(handler_rx->rule); + ft_rx->refcount--; + kfree(handler_rx); +err: + return ERR_PTR(err); +} + +static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_flow_handler *handler = NULL; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_ib_flow_prio *ft_prio_tx = NULL; + struct mlx5_ib_flow_prio *ft_prio; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr; + size_t min_ucmd_sz, required_ucmd_sz; + int err; + int underlay_qpn; + + if (udata && udata->inlen) { + min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) + + sizeof(ucmd_hdr.reserved); + if (udata->inlen < min_ucmd_sz) + return ERR_PTR(-EOPNOTSUPP); + + err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz); + if (err) + return ERR_PTR(err); + + /* currently supports only one counters data */ + if (ucmd_hdr.ncounters_data > 1) + return ERR_PTR(-EINVAL); + + required_ucmd_sz = min_ucmd_sz + + sizeof(struct mlx5_ib_flow_counters_data) * + ucmd_hdr.ncounters_data; + if (udata->inlen > required_ucmd_sz && + !ib_is_udata_cleared(udata, required_ucmd_sz, + udata->inlen - required_ucmd_sz)) + return ERR_PTR(-EOPNOTSUPP); + + ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL); + if (!ucmd) + return ERR_PTR(-ENOMEM); + + err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz); + if (err) + goto free_ucmd; + } + + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) { + err = -ENOMEM; + goto free_ucmd; + } + + if (domain != IB_FLOW_DOMAIN_USER || + flow_attr->port > dev->num_ports || + (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP | + IB_FLOW_ATTR_FLAGS_EGRESS))) { + err = -EINVAL; + goto free_ucmd; + } + + if (is_egress && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { + err = -EINVAL; + goto free_ucmd; + } + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) { + err = -ENOMEM; + goto free_ucmd; + } + + mutex_lock(&dev->flow_db->lock); + + ft_prio = get_flow_table(dev, flow_attr, + is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); + if (IS_ERR(ft_prio_tx)) { + err = PTR_ERR(ft_prio_tx); + ft_prio_tx = NULL; + goto destroy_ft; + } + } + + if (is_egress) { + dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + if (mqp->flags & MLX5_IB_QP_RSS) + dst->tir_num = mqp->rss_qp.tirn; + else + dst->tir_num = mqp->raw_packet_qp.rq.tirn; + } + + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { + handler = create_dont_trap_rule(dev, ft_prio, + flow_attr, dst); + } else { + underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? + mqp->underlay_qpn : 0; + handler = _create_flow_rule(dev, ft_prio, flow_attr, + dst, underlay_qpn, ucmd); + } + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + handler = create_leftovers_rule(dev, ft_prio, flow_attr, + dst); + } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); + } else { + err = -EINVAL; + goto destroy_ft; + } + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + handler = NULL; + goto destroy_ft; + } + + mutex_unlock(&dev->flow_db->lock); + kfree(dst); + kfree(ucmd); + + return &handler->ibflow; + +destroy_ft: + put_flow_table(dev, ft_prio, false); + if (ft_prio_tx) + put_flow_table(dev, ft_prio_tx, false); +unlock: + mutex_unlock(&dev->flow_db->lock); + kfree(dst); +free_ucmd: + kfree(ucmd); + return ERR_PTR(err); +} + +static struct mlx5_ib_flow_prio *_get_flow_table(struct mlx5_ib_dev *dev, + int priority, bool mcast) +{ + int max_table_size; + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio; + + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + log_max_ft_size)); + if (max_table_size < MLX5_FS_MAX_ENTRIES) + return ERR_PTR(-ENOMEM); + + if (mcast) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = ib_prio_to_core_prio(priority, false); + + ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); + if (!ns) + return ERR_PTR(-ENOTSUPP); + + prio = &dev->flow_db->prios[priority]; + + if (prio->flow_table) + return prio; + + return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES, + MLX5_FS_MAX_TYPES); +} + +static struct mlx5_ib_flow_handler * +_create_raw_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct mlx5_flow_destination *dst, + struct mlx5_ib_flow_matcher *fs_matcher, + void *cmd_in, int inlen) +{ + struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft = ft_prio->flow_table; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !spec) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + memcpy(spec->match_value, cmd_in, inlen); + memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params, + fs_matcher->mask_len); + spec->match_criteria_enable = fs_matcher->match_criteria_enable; + + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + handler->rule = mlx5_add_flow_rules(ft, spec, + &flow_act, dst, 1); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + ft_prio->refcount++; + handler->prio = ft_prio; + handler->dev = dev; + ft_prio->flow_table = ft; + +free: + if (err) + kfree(handler); + kvfree(spec); + return err ? ERR_PTR(err) : handler; +} + +static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, + void *match_v) +{ + void *match_c; + void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4; + void *dmac, *dmac_mask; + void *ipv4, *ipv4_mask; + + if (!(fs_matcher->match_criteria_enable & + (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT))) + return false; + + match_c = fs_matcher->matcher_mask.match_params; + match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + + dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, + dmac_47_16); + dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, + dmac_47_16); + + if (is_multicast_ether_addr(dmac) && + is_multicast_ether_addr(dmac_mask)) + return true; + + ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + if (ipv4_is_multicast(*(__be32 *)(ipv4)) && + ipv4_is_multicast(*(__be32 *)(ipv4_mask))) + return true; + + return false; +} + +struct mlx5_ib_flow_handler * +mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_matcher *fs_matcher, + void *cmd_in, int inlen, int dest_id, + int dest_type) +{ + struct mlx5_flow_destination *dst; + struct mlx5_ib_flow_prio *ft_prio; + int priority = fs_matcher->priority; + struct mlx5_ib_flow_handler *handler; + bool mcast; + int err; + + if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL) + return ERR_PTR(-EOPNOTSUPP); + + if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO) + return ERR_PTR(-ENOMEM); + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + mcast = raw_fs_is_multicast(fs_matcher, cmd_in); + mutex_lock(&dev->flow_db->lock); + + ft_prio = _get_flow_table(dev, priority, mcast); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + + if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) { + dst->type = dest_type; + dst->tir_num = dest_id; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; + dst->ft_num = dest_id; + } + + handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, cmd_in, + inlen); + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + goto destroy_ft; + } + + mutex_unlock(&dev->flow_db->lock); + atomic_inc(&fs_matcher->usecnt); + handler->flow_matcher = fs_matcher; + + kfree(dst); + + return handler; + +destroy_ft: + put_flow_table(dev, ft_prio, false); +unlock: + mutex_unlock(&dev->flow_db->lock); + kfree(dst); + + return ERR_PTR(err); +} + +static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags) +{ + u32 flags = 0; + + if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA) + flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA; + + return flags; +} + +#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA +static struct ib_flow_action * +mlx5_ib_create_flow_action_esp(struct ib_device *device, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *mdev = to_mdev(device); + struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm; + struct mlx5_accel_esp_xfrm_attrs accel_attrs = {}; + struct mlx5_ib_flow_action *action; + u64 action_flags; + u64 flags; + int err = 0; + + err = uverbs_get_flags64( + &action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1)); + if (err) + return ERR_PTR(err); + + flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags); + + /* We current only support a subset of the standard features. Only a + * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn + * (with overlap). Full offload mode isn't supported. + */ + if (!attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT))) + return ERR_PTR(-EOPNOTSUPP); + + if (attr->keymat->protocol != + IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM) + return ERR_PTR(-EOPNOTSUPP); + + aes_gcm = &attr->keymat->keymat.aes_gcm; + + if (aes_gcm->icv_len != 16 || + aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) + return ERR_PTR(-EOPNOTSUPP); + + action = kmalloc(sizeof(*action), GFP_KERNEL); + if (!action) + return ERR_PTR(-ENOMEM); + + action->esp_aes_gcm.ib_flags = attr->flags; + memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key, + sizeof(accel_attrs.keymat.aes_gcm.aes_key)); + accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8; + memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt, + sizeof(accel_attrs.keymat.aes_gcm.salt)); + memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv, + sizeof(accel_attrs.keymat.aes_gcm.seq_iv)); + accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8; + accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ; + accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT) + accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT; + + action->esp_aes_gcm.ctx = + mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags); + if (IS_ERR(action->esp_aes_gcm.ctx)) { + err = PTR_ERR(action->esp_aes_gcm.ctx); + goto err_parse; + } + + action->esp_aes_gcm.ib_flags = attr->flags; + + return &action->ib_action; + +err_parse: + kfree(action); + return ERR_PTR(err); +} + +static int +mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + struct mlx5_accel_esp_xfrm_attrs accel_attrs; + int err = 0; + + if (attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))) + return -EOPNOTSUPP; + + /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can + * be modified. + */ + if (!(maction->esp_aes_gcm.ib_flags & + IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) && + attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)) + return -EINVAL; + + memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs, + sizeof(accel_attrs)); + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + else + accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx, + &accel_attrs); + if (err) + return err; + + maction->esp_aes_gcm.ib_flags &= + ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + maction->esp_aes_gcm.ib_flags |= + attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + + return 0; +} + +static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + + switch (action->type) { + case IB_FLOW_ACTION_ESP: + /* + * We only support aes_gcm by now, so we implicitly know this is + * the underline crypto. + */ + mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); + break; + default: + WARN_ON(true); + break; + } + + kfree(maction); + return 0; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *mqp = to_mqp(ibqp); + int err; + + if (mqp->flags & MLX5_IB_QP_UNDERLAY) { + mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n"); + return -EOPNOTSUPP; + } + + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); + if (err) + return err; + + dev->mdev->rev_id = dev->mdev->pdev->revision; + + return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void pkey_change_handler(struct work_struct *work) +{ + struct mlx5_ib_port_resources *ports = + container_of(work, struct mlx5_ib_port_resources, + pkey_change_work); + + mutex_lock(&ports->devr->mutex); + mlx5_ib_gsi_pkey_change(ports->gsi); + mutex_unlock(&ports->devr->mutex); +} + +static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_qp *mqp; + struct mlx5_ib_cq *send_mcq, *recv_mcq; + struct mlx5_core_cq *mcq; + struct list_head cq_armed_list; + unsigned long flags_qp; + unsigned long flags_cq; + unsigned long flags; + + INIT_LIST_HEAD(&cq_armed_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + /*At that point all inflight post send were put to be executed as of we + * lock/unlock above locks Now need to arm all involved CQs. + */ + list_for_each_entry(mcq, &cq_armed_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); +} + +static void delay_drop_handler(struct work_struct *work) +{ + int err; + struct mlx5_ib_delay_drop *delay_drop = + container_of(work, struct mlx5_ib_delay_drop, + delay_drop_work); + + atomic_inc(&delay_drop->events_cnt); + + mutex_lock(&delay_drop->lock); + err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, + delay_drop->timeout); + if (err) { + mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", + delay_drop->timeout); + delay_drop->activate = false; + } + mutex_unlock(&delay_drop->lock); +} + +static void mlx5_ib_handle_event(struct work_struct *_work) +{ + struct mlx5_ib_event_work *work = + container_of(_work, struct mlx5_ib_event_work, work); + struct mlx5_ib_dev *ibdev; + struct ib_event ibev; + bool fatal = false; + u8 port = (u8)work->param; + + if (mlx5_core_is_mp_slave(work->dev)) { + ibdev = mlx5_ib_get_ibdev_from_mpi(work->context); + if (!ibdev) + goto out; + } else { + ibdev = work->context; + } + + switch (work->event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); + fatal = true; + break; + + case MLX5_DEV_EVENT_PORT_UP: + case MLX5_DEV_EVENT_PORT_DOWN: + case MLX5_DEV_EVENT_PORT_INITIALIZED: + /* In RoCE, port up/down events are handled in + * mlx5_netdev_event(). + */ + if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET) + goto out; + + ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ? + IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + break; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + break; + case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT: + schedule_work(&ibdev->delay_drop.delay_drop_work); + goto out; + default: + goto out; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if (!rdma_is_port_valid(&ibdev->ib_dev, port)) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + goto out; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); + + if (fatal) + ibdev->ib_active = false; +out: + kfree(work); +} + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_event_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + INIT_WORK(&work->work, mlx5_ib_handle_event); + work->dev = dev; + work->param = param; + work->context = context; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); +} + +static int set_has_smi_cap(struct mlx5_ib_dev *dev) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + int port; + + for (port = 1; port <= dev->num_ports; port++) { + dev->mdev->port_caps[port - 1].has_smi = false; + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) { + if (MLX5_CAP_GEN(dev->mdev, ib_virt)) { + err = mlx5_query_hca_vport_context(dev->mdev, 0, + port, 0, + &vport_ctx); + if (err) { + mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", + port, err); + return err; + } + dev->mdev->port_caps[port - 1].has_smi = + vport_ctx.has_smi; + } else { + dev->mdev->port_caps[port - 1].has_smi = true; + } + } + } + return 0; +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + int port; + + for (port = 1; port <= dev->num_ports; port++) + mlx5_query_ext_port_caps(dev, port); +} + +static int get_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + int err = -ENOMEM; + struct ib_udata uhw = {.inlen = 0, .outlen = 0}; + + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); + if (!dprops) + goto out; + + err = set_has_smi_cap(dev); + if (err) + goto out; + + err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + + memset(pprops, 0, sizeof(*pprops)); + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", + port, err); + goto out; + } + + dev->mdev->port_caps[port - 1].pkey_table_len = + dprops->max_pkeys; + dev->mdev->port_caps[port - 1].gid_table_len = + pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n", + port, dprops->max_pkeys, pprops->gid_tbl_len); + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + if (dev->umrc.qp) + mlx5_ib_destroy_qp(dev->umrc.qp); + if (dev->umrc.cq) + ib_free_cq(dev->umrc.cq); + if (dev->umrc.pd) + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev, 0); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + qp->send_cq = init_attr->send_cq; + qp->recv_cq = init_attr->recv_cq; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(attr); + kfree(init_attr); + + return 0; + +error_4: + mlx5_ib_destroy_qp(qp); + dev->umrc.qp = NULL; + +error_3: + ib_free_cq(cq); + dev->umrc.cq = NULL; + +error_2: + ib_dealloc_pd(pd); + dev->umrc.pd = NULL; + +error_0: + kfree(attr); + kfree(init_attr); + return ret; +} + +static u8 mlx5_get_umr_fence(u8 umr_fence_cap) +{ + switch (umr_fence_cap) { + case MLX5_CAP_UMR_FENCE_NONE: + return MLX5_FENCE_MODE_NONE; + case MLX5_CAP_UMR_FENCE_SMALL: + return MLX5_FENCE_MODE_INITIATOR_SMALL; + default: + return MLX5_FENCE_MODE_STRONG_ORDERING; + } +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + struct ib_cq_init_attr cq_attr = {.cqe = 1}; + int port; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + mutex_init(&devr->mutex); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = &dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_BASIC; + devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s1)) { + ret = PTR_ERR(devr->s1); + goto error5; + } + devr->s1->device = &dev->ib_dev; + devr->s1->pd = devr->p0; + devr->s1->uobject = NULL; + devr->s1->event_handler = NULL; + devr->s1->srq_context = NULL; + devr->s1->srq_type = IB_SRQT_BASIC; + devr->s1->ext.cq = devr->c0; + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s1->usecnt, 0); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + devr->ports[port].devr = devr; + } + + return 0; + +error5: + mlx5_ib_destroy_srq(devr->s0); +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + struct mlx5_ib_dev *dev = + container_of(devr, struct mlx5_ib_dev, devr); + int port; + + mlx5_ib_destroy_srq(devr->s1); + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); + + /* Make sure no change P_Key work items are still executing */ + for (port = 0; port < dev->num_ports; ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); +} + +static u32 get_core_cap_flags(struct ib_device *ibdev, + struct mlx5_hca_vport_context *rep) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); + u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + bool raw_support = !mlx5_core_mp_enabled(dev->mdev); + u32 ret = 0; + + if (rep->grh_required) + ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED; + + if (ll == IB_LINK_LAYER_INFINIBAND) + return ret | RDMA_CORE_PORT_IBA_IB; + + if (raw_support) + ret |= RDMA_CORE_PORT_RAW_PACKET; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) + return ret; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) + return ret; + + if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE; + + if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return ret; +} + +static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); + struct mlx5_hca_vport_context rep = {0}; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + if (ll == IB_LINK_LAYER_INFINIBAND) { + err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0, + &rep); + if (err) + return err; + } + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep); + if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce)) + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + return 0; +} + +static void get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct mlx5_ib_dev *dev = + container_of(ibdev, struct mlx5_ib_dev, ib_dev); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d", + fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), + fw_rev_sub(dev->mdev)); +} + +static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev, + MLX5_FLOW_NAMESPACE_LAG); + struct mlx5_flow_table *ft; + int err; + + if (!ns || !mlx5_lag_is_active(mdev)) + return 0; + + err = mlx5_cmd_create_vport_lag(mdev); + if (err) + return err; + + ft = mlx5_create_lag_demux_flow_table(ns, 0, 0); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto err_destroy_vport_lag; + } + + dev->flow_db->lag_demux_ft = ft; + return 0; + +err_destroy_vport_lag: + mlx5_cmd_destroy_vport_lag(mdev); + return err; +} + +static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + + if (dev->flow_db->lag_demux_ft) { + mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); + dev->flow_db->lag_demux_ft = NULL; + + mlx5_cmd_destroy_vport_lag(mdev); + } +} + +static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) +{ + int err; + + dev->roce[port_num].nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce[port_num].nb); + if (err) { + dev->roce[port_num].nb.notifier_call = NULL; + return err; + } + + return 0; +} + +static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) +{ + if (dev->roce[port_num].nb.notifier_call) { + unregister_netdevice_notifier(&dev->roce[port_num].nb); + dev->roce[port_num].nb.notifier_call = NULL; + } +} + +static int mlx5_enable_eth(struct mlx5_ib_dev *dev) +{ + int err; + + if (MLX5_CAP_GEN(dev->mdev, roce)) { + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + } + + err = mlx5_eth_lag_init(dev); + if (err) + goto err_disable_roce; + + return 0; + +err_disable_roce: + if (MLX5_CAP_GEN(dev->mdev, roce)) + mlx5_nic_vport_disable_roce(dev->mdev); + + return err; +} + +static void mlx5_disable_eth(struct mlx5_ib_dev *dev) +{ + mlx5_eth_lag_cleanup(dev); + if (MLX5_CAP_GEN(dev->mdev, roce)) + mlx5_nic_vport_disable_roce(dev->mdev); +} + +struct mlx5_ib_counter { + const char *name; + size_t offset; +}; + +#define INIT_Q_COUNTER(_name) \ + { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} + +static const struct mlx5_ib_counter basic_q_cnts[] = { + INIT_Q_COUNTER(rx_write_requests), + INIT_Q_COUNTER(rx_read_requests), + INIT_Q_COUNTER(rx_atomic_requests), + INIT_Q_COUNTER(out_of_buffer), +}; + +static const struct mlx5_ib_counter out_of_seq_q_cnts[] = { + INIT_Q_COUNTER(out_of_sequence), +}; + +static const struct mlx5_ib_counter retrans_q_cnts[] = { + INIT_Q_COUNTER(duplicate_request), + INIT_Q_COUNTER(rnr_nak_retry_err), + INIT_Q_COUNTER(packet_seq_err), + INIT_Q_COUNTER(implied_nak_seq_err), + INIT_Q_COUNTER(local_ack_timeout_err), +}; + +#define INIT_CONG_COUNTER(_name) \ + { .name = #_name, .offset = \ + MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)} + +static const struct mlx5_ib_counter cong_cnts[] = { + INIT_CONG_COUNTER(rp_cnp_ignored), + INIT_CONG_COUNTER(rp_cnp_handled), + INIT_CONG_COUNTER(np_ecn_marked_roce_packets), + INIT_CONG_COUNTER(np_cnp_sent), +}; + +static const struct mlx5_ib_counter extended_err_cnts[] = { + INIT_Q_COUNTER(resp_local_length_error), + INIT_Q_COUNTER(resp_cqe_error), + INIT_Q_COUNTER(req_cqe_error), + INIT_Q_COUNTER(req_remote_invalid_request), + INIT_Q_COUNTER(req_remote_access_errors), + INIT_Q_COUNTER(resp_remote_access_errors), + INIT_Q_COUNTER(resp_cqe_flush_error), + INIT_Q_COUNTER(req_cqe_flush_error), +}; + +#define INIT_EXT_PPCNT_COUNTER(_name) \ + { .name = #_name, .offset = \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_extended_cntrs_grp_data_layout._name##_high)} + +static const struct mlx5_ib_counter ext_ppcnt_cnts[] = { + INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated), +}; + +static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) +{ + int i; + + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].cnts.set_id_valid) + mlx5_core_dealloc_q_counter(dev->mdev, + dev->port[i].cnts.set_id); + kfree(dev->port[i].cnts.names); + kfree(dev->port[i].cnts.offsets); + } +} + +static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_counters *cnts) +{ + u32 num_counters; + + num_counters = ARRAY_SIZE(basic_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) + num_counters += ARRAY_SIZE(out_of_seq_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) + num_counters += ARRAY_SIZE(retrans_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) + num_counters += ARRAY_SIZE(extended_err_cnts); + + cnts->num_q_counters = num_counters; + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + cnts->num_cong_counters = ARRAY_SIZE(cong_cnts); + num_counters += ARRAY_SIZE(cong_cnts); + } + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts); + num_counters += ARRAY_SIZE(ext_ppcnt_cnts); + } + cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL); + if (!cnts->names) + return -ENOMEM; + + cnts->offsets = kcalloc(num_counters, + sizeof(cnts->offsets), GFP_KERNEL); + if (!cnts->offsets) + goto err_names; + + return 0; + +err_names: + kfree(cnts->names); + cnts->names = NULL; + return -ENOMEM; +} + +static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, + const char **names, + size_t *offsets) +{ + int i; + int j = 0; + + for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { + names[j] = basic_q_cnts[i].name; + offsets[j] = basic_q_cnts[i].offset; + } + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { + for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { + names[j] = out_of_seq_q_cnts[i].name; + offsets[j] = out_of_seq_q_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { + names[j] = retrans_q_cnts[i].name; + offsets[j] = retrans_q_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { + for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + names[j] = extended_err_cnts[i].name; + offsets[j] = extended_err_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) { + names[j] = cong_cnts[i].name; + offsets[j] = cong_cnts[i].offset; + } + } + + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) { + names[j] = ext_ppcnt_cnts[i].name; + offsets[j] = ext_ppcnt_cnts[i].offset; + } + } +} + +static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) +{ + int err = 0; + int i; + + for (i = 0; i < dev->num_ports; i++) { + err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts); + if (err) + goto err_alloc; + + mlx5_ib_fill_counters(dev, dev->port[i].cnts.names, + dev->port[i].cnts.offsets); + + err = mlx5_core_alloc_q_counter(dev->mdev, + &dev->port[i].cnts.set_id); + if (err) { + mlx5_ib_warn(dev, + "couldn't allocate queue counter for port %d, err %d\n", + i + 1, err); + goto err_alloc; + } + dev->port[i].cnts.set_id_valid = true; + } + + return 0; + +err_alloc: + mlx5_ib_dealloc_counters(dev); + return err; +} + +static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_port *port = &dev->port[port_num - 1]; + + /* We support only per port stats */ + if (port_num == 0) + return NULL; + + return rdma_alloc_hw_stats_struct(port->cnts.names, + port->cnts.num_q_counters + + port->cnts.num_cong_counters + + port->cnts.num_ext_ppcnt_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, + struct mlx5_ib_port *port, + struct rdma_hw_stats *stats) +{ + int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); + void *out; + __be32 val; + int ret, i; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_q_counter(mdev, + port->cnts.set_id, 0, + out, outlen); + if (ret) + goto free; + + for (i = 0; i < port->cnts.num_q_counters; i++) { + val = *(__be32 *)(out + port->cnts.offsets[i]); + stats->value[i] = (u64)be32_to_cpu(val); + } + +free: + kvfree(out); + return ret; +} + +static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_port *port, + struct rdma_hw_stats *stats) +{ + int offset = port->cnts.num_q_counters + port->cnts.num_cong_counters; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + int ret, i; + void *out; + + out = kvzalloc(sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out); + if (ret) + goto free; + + for (i = 0; i < port->cnts.num_ext_ppcnt_counters; i++) { + stats->value[i + offset] = + be64_to_cpup((__be64 *)(out + + port->cnts.offsets[i + offset])); + } + +free: + kvfree(out); + return ret; +} + +static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_port *port = &dev->port[port_num - 1]; + struct mlx5_core_dev *mdev; + int ret, num_counters; + u8 mdev_port_num; + + if (!stats) + return -EINVAL; + + num_counters = port->cnts.num_q_counters + + port->cnts.num_cong_counters + + port->cnts.num_ext_ppcnt_counters; + + /* q_counters are per IB device, query the master mdev */ + ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); + if (ret) + return ret; + + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + ret = mlx5_ib_query_ext_ppcnt_counters(dev, port, stats); + if (ret) + return ret; + } + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, + &mdev_port_num); + if (!mdev) { + /* If port is not affiliated yet, its in down state + * which doesn't have any counters yet, so it would be + * zero. So no need to read from the HCA. + */ + goto done; + } + ret = mlx5_lag_query_cong_counters(dev->mdev, + stats->value + + port->cnts.num_q_counters, + port->cnts.num_cong_counters, + port->cnts.offsets + + port->cnts.num_q_counters); + + mlx5_ib_put_native_port_mdev(dev, port_num); + if (ret) + return ret; + } + +done: + return num_counters; +} + +static struct net_device* +mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, + u8 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)) +{ + struct net_device *netdev; + + if (type != RDMA_NETDEV_IPOIB) + return ERR_PTR(-EOPNOTSUPP); + + netdev = mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca, + name, setup); + return netdev; +} + +static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!dev->delay_drop.dbg) + return; + debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs); + kfree(dev->delay_drop.dbg); + dev->delay_drop.dbg = NULL; +} + +static void cancel_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + cancel_work_sync(&dev->delay_drop.delay_drop_work); + delay_drop_debugfs_cleanup(dev); +} + +static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + char lbuf[20]; + int len; + + len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout); + return simple_read_from_buffer(buf, count, pos, lbuf, len); +} + +static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + u32 timeout; + u32 var; + + if (kstrtouint_from_user(buf, count, 0, &var)) + return -EFAULT; + + timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS * + 1000); + if (timeout != var) + mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n", + timeout); + + delay_drop->timeout = timeout; + + return count; +} + +static const struct file_operations fops_delay_drop_timeout = { + .owner = THIS_MODULE, + .open = simple_open, + .write = delay_drop_timeout_write, + .read = delay_drop_timeout_read, +}; + +static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dbg_delay_drop *dbg; + + if (!mlx5_debugfs_root) + return 0; + + dbg = kzalloc(sizeof(*dbg), GFP_KERNEL); + if (!dbg) + return -ENOMEM; + + dev->delay_drop.dbg = dbg; + + dbg->dir_debugfs = + debugfs_create_dir("delay_drop", + dev->mdev->priv.dbg_root); + if (!dbg->dir_debugfs) + goto out_debugfs; + + dbg->events_cnt_debugfs = + debugfs_create_atomic_t("num_timeout_events", 0400, + dbg->dir_debugfs, + &dev->delay_drop.events_cnt); + if (!dbg->events_cnt_debugfs) + goto out_debugfs; + + dbg->rqs_cnt_debugfs = + debugfs_create_atomic_t("num_rqs", 0400, + dbg->dir_debugfs, + &dev->delay_drop.rqs_cnt); + if (!dbg->rqs_cnt_debugfs) + goto out_debugfs; + + dbg->timeout_debugfs = + debugfs_create_file("timeout", 0600, + dbg->dir_debugfs, + &dev->delay_drop, + &fops_delay_drop_timeout); + if (!dbg->timeout_debugfs) + goto out_debugfs; + + return 0; + +out_debugfs: + delay_drop_debugfs_cleanup(dev); + return -ENOMEM; +} + +static void init_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + mutex_init(&dev->delay_drop.lock); + dev->delay_drop.dev = dev; + dev->delay_drop.activate = false; + dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); + atomic_set(&dev->delay_drop.rqs_cnt, 0); + atomic_set(&dev->delay_drop.events_cnt, 0); + + if (delay_drop_debugfs_init(dev)) + mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n"); +} + +static const struct cpumask * +mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_get_vector_affinity_hint(dev->mdev, comp_vector); +} + +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + struct mlx5_ib_port *port = &ibdev->port[port_num]; + int comps; + int err; + int i; + + mlx5_ib_cleanup_cong_debugfs(ibdev, port_num); + + spin_lock(&port->mp.mpi_lock); + if (!mpi->ibdev) { + spin_unlock(&port->mp.mpi_lock); + return; + } + mpi->ibdev = NULL; + + spin_unlock(&port->mp.mpi_lock); + mlx5_remove_netdev_notifier(ibdev, port_num); + spin_lock(&port->mp.mpi_lock); + + comps = mpi->mdev_refcnt; + if (comps) { + mpi->unaffiliate = true; + init_completion(&mpi->unref_comp); + spin_unlock(&port->mp.mpi_lock); + + for (i = 0; i < comps; i++) + wait_for_completion(&mpi->unref_comp); + + spin_lock(&port->mp.mpi_lock); + mpi->unaffiliate = false; + } + + port->mp.mpi = NULL; + + spin_unlock(&port->mp.mpi_lock); + + err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev); + + mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1); + /* Log an error, still needed to cleanup the pointers and add + * it back to the list. + */ + if (err) + mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n", + port_num + 1); + + ibdev->roce[port_num].last_port_state = IB_PORT_DOWN; +} + +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + int err; + + spin_lock(&ibdev->port[port_num].mp.mpi_lock); + if (ibdev->port[port_num].mp.mpi) { + mlx5_ib_dbg(ibdev, "port %d already affiliated.\n", + port_num + 1); + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + return false; + } + + ibdev->port[port_num].mp.mpi = mpi; + mpi->ibdev = ibdev; + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + + err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev); + if (err) + goto unbind; + + err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev)); + if (err) + goto unbind; + + err = mlx5_add_netdev_notifier(ibdev, port_num); + if (err) { + mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n", + port_num + 1); + goto unbind; + } + + err = mlx5_ib_init_cong_debugfs(ibdev, port_num); + if (err) + goto unbind; + + return true; + +unbind: + mlx5_ib_unbind_slave_port(ibdev, mpi); + return false; +} + +static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + struct mlx5_ib_multiport_info *mpi; + int err; + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return 0; + + err = mlx5_query_nic_vport_system_image_guid(dev->mdev, + &dev->sys_image_guid); + if (err) + return err; + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + bool bound = false; + + /* build a stub multiport info struct for the native port. */ + if (i == port_num) { + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) { + mutex_unlock(&mlx5_ib_multiport_mutex); + mlx5_nic_vport_disable_roce(dev->mdev); + return -ENOMEM; + } + + mpi->is_master = true; + mpi->mdev = dev->mdev; + mpi->sys_image_guid = dev->sys_image_guid; + dev->port[i].mp.mpi = mpi; + mpi->ibdev = dev; + mpi = NULL; + continue; + } + + list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, + list) { + if (dev->sys_image_guid == mpi->sys_image_guid && + (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + bound = mlx5_ib_bind_slave_port(dev, mpi); + } + + if (bound) { + dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n"); + mlx5_ib_dbg(dev, "port %d bound\n", i + 1); + list_del(&mpi->list); + break; + } + } + if (!bound) { + get_port_caps(dev, i + 1); + mlx5_ib_dbg(dev, "no free port found for port %d\n", + i + 1); + } + } + + list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return err; +} + +static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].mp.mpi) { + /* Destroy the native port stub */ + if (i == port_num) { + kfree(dev->port[i].mp.mpi); + dev->port[i].mp.mpi = NULL; + } else { + mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1); + list_add_tail(&dev->port[i].mp.mpi->list, + &mlx5_ib_unaffiliated_port_list); + mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi); + } + } + } + + mlx5_ib_dbg(dev, "removing from devlist\n"); + list_del(&dev->ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + + mlx5_nic_vport_disable_roce(dev->mdev); +} + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_dm, + UVERBS_OBJECT_DM, + UVERBS_METHOD_DM_ALLOC, + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY)); + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_flow_action, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + enum mlx5_ib_uapi_flow_action_flags)); + +static int populate_specs_root(struct mlx5_ib_dev *dev) +{ + const struct uverbs_object_tree_def **trees = dev->driver_trees; + size_t num_trees = 0; + + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) + trees[num_trees++] = &mlx5_ib_flow_action; + + if (MLX5_CAP_DEV_MEM(dev->mdev, memic)) + trees[num_trees++] = &mlx5_ib_dm; + + if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_UCTX) + trees[num_trees++] = mlx5_ib_get_devx_tree(); + + num_trees += mlx5_ib_get_flow_trees(trees + num_trees); + + WARN_ON(num_trees >= ARRAY_SIZE(dev->driver_trees)); + trees[num_trees] = NULL; + dev->ib_dev.driver_specs = trees; + + return 0; +} + +static int mlx5_ib_read_counters(struct ib_counters *counters, + struct ib_counters_read_attr *read_attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + struct mlx5_read_counters_attr mread_attr = {}; + struct mlx5_ib_flow_counters_desc *desc; + int ret, i; + + mutex_lock(&mcounters->mcntrs_mutex); + if (mcounters->cntrs_max_index > read_attr->ncounters) { + ret = -EINVAL; + goto err_bound; + } + + mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64), + GFP_KERNEL); + if (!mread_attr.out) { + ret = -ENOMEM; + goto err_bound; + } + + mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl; + mread_attr.flags = read_attr->flags; + ret = mcounters->read_counters(counters->device, &mread_attr); + if (ret) + goto err_read; + + /* do the pass over the counters data array to assign according to the + * descriptions and indexing pairs + */ + desc = mcounters->counters_data; + for (i = 0; i < mcounters->ncounters; i++) + read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description]; + +err_read: + kfree(mread_attr.out); +err_bound: + mutex_unlock(&mcounters->mcntrs_mutex); + return ret; +} + +static int mlx5_ib_destroy_counters(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + counters_clear_description(counters); + if (mcounters->hw_cntrs_hndl) + mlx5_fc_destroy(to_mdev(counters->device)->mdev, + mcounters->hw_cntrs_hndl); + + kfree(mcounters); + + return 0; +} + +static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters; + + mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL); + if (!mcounters) + return ERR_PTR(-ENOMEM); + + mutex_init(&mcounters->mcntrs_mutex); + + return &mcounters->ibcntrs; +} + +void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_multiport_master(dev); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + cleanup_srcu_struct(&dev->mr_srcu); +#endif + kfree(dev->port); +} + +int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + const char *name; + int err; + int i; + + dev->port = kcalloc(dev->num_ports, sizeof(*dev->port), + GFP_KERNEL); + if (!dev->port) + return -ENOMEM; + + for (i = 0; i < dev->num_ports; i++) { + spin_lock_init(&dev->port[i].mp.mpi_lock); + rwlock_init(&dev->roce[i].netdev_lock); + } + + err = mlx5_ib_init_multiport_master(dev); + if (err) + goto err_free_port; + + if (!mlx5_core_mp_enabled(mdev)) { + for (i = 1; i <= dev->num_ports; i++) { + err = get_port_caps(dev, i); + if (err) + break; + } + } else { + err = get_port_caps(dev, mlx5_core_native_port_num(mdev)); + } + if (err) + goto err_mp; + + if (mlx5_use_mad_ifc(dev)) + get_ext_port_caps(dev); + + if (!mlx5_lag_is_active(mdev)) + name = "mlx5_%d"; + else + name = "mlx5_bond_%d"; + + strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = + dev->mdev->priv.eq_table.num_comp_vectors; + dev->ib_dev.dev.parent = &mdev->pdev->dev; + + mutex_init(&dev->cap_mask_mutex); + INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); + + spin_lock_init(&dev->memic.memic_lock); + dev->memic.dev = mdev; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + err = init_srcu_struct(&dev->mr_srcu); + if (err) + goto err_free_port; +#endif + + return 0; +err_mp: + mlx5_ib_cleanup_multiport_master(dev); + +err_free_port: + kfree(dev->port); + + return -ENOMEM; +} + +static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev) +{ + dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); + + if (!dev->flow_db) + return -ENOMEM; + + mutex_init(&dev->flow_db->lock); + + return 0; +} + +int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dev *nic_dev; + + nic_dev = mlx5_ib_get_uplink_ibdev(dev->mdev->priv.eswitch); + + if (!nic_dev) + return -EINVAL; + + dev->flow_db = nic_dev->flow_db; + + return 0; +} + +static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) +{ + kfree(dev->flow_db); +} + +int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + int err; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + dev->ib_dev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.add_gid = mlx5_ib_add_gid; + dev->ib_dev.del_gid = mlx5_ib_del_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.drain_sq = mlx5_ib_drain_sq; + dev->ib_dev.drain_rq = mlx5_ib_drain_rq; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr; + dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + dev->ib_dev.get_dev_fw_str = get_dev_fw_str; + dev->ib_dev.get_vector_affinity = mlx5_ib_get_vector_affinity; + if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) + dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev; + + if (mlx5_core_is_pf(mdev)) { + dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; + dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; + dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats; + dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; + } + + dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; + + dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence)); + + if (MLX5_CAP_GEN(mdev, imaicl)) { + dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; + dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + if (MLX5_CAP_GEN(mdev, xrc)) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + if (MLX5_CAP_DEV_MEM(mdev, memic)) { + dev->ib_dev.alloc_dm = mlx5_ib_alloc_dm; + dev->ib_dev.dealloc_dm = mlx5_ib_dealloc_dm; + dev->ib_dev.reg_dm_mr = mlx5_ib_reg_dm_mr; + } + + dev->ib_dev.create_flow = mlx5_ib_create_flow; + dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + dev->ib_dev.create_flow_action_esp = mlx5_ib_create_flow_action_esp; + dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action; + dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp; + dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; + dev->ib_dev.create_counters = mlx5_ib_create_counters; + dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters; + dev->ib_dev.read_counters = mlx5_ib_read_counters; + + err = init_node_data(dev); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) || + MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + mutex_init(&dev->lb_mutex); + + return 0; +} + +static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev) +{ + dev->ib_dev.get_port_immutable = mlx5_port_immutable; + dev->ib_dev.query_port = mlx5_ib_query_port; + + return 0; +} + +int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev) +{ + dev->ib_dev.get_port_immutable = mlx5_port_rep_immutable; + dev->ib_dev.query_port = mlx5_ib_rep_query_port; + + return 0; +} + +static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev) +{ + u8 port_num; + int i; + + for (i = 0; i < dev->num_ports; i++) { + dev->roce[i].dev = dev; + dev->roce[i].native_port_num = i + 1; + dev->roce[i].last_port_state = IB_PORT_DOWN; + } + + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; + dev->ib_dev.create_wq = mlx5_ib_create_wq; + dev->ib_dev.modify_wq = mlx5_ib_modify_wq; + dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; + dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; + dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; + + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + + return mlx5_add_netdev_notifier(dev, port_num); +} + +static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev) +{ + u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1; + + mlx5_remove_netdev_notifier(dev, port_num); +} + +int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + int err = 0; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) + err = mlx5_ib_stage_common_roce_init(dev); + + return err; +} + +void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_stage_common_roce_cleanup(dev); +} + +static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + int err; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + err = mlx5_ib_stage_common_roce_init(dev); + if (err) + return err; + + err = mlx5_enable_eth(dev); + if (err) + goto cleanup; + } + + return 0; +cleanup: + mlx5_ib_stage_common_roce_cleanup(dev); + + return err; +} + +static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + mlx5_disable_eth(dev); + mlx5_ib_stage_common_roce_cleanup(dev); + } +} + +int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev) +{ + return create_dev_resources(&dev->devr); +} + +void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_dev_resources(&dev->devr); +} + +static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev) +{ + mlx5_ib_internal_fill_odp_caps(dev); + + return mlx5_ib_odp_init_one(dev); +} + +int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { + dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; + dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; + + return mlx5_ib_alloc_counters(dev); + } + + return 0; +} + +void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + mlx5_ib_dealloc_counters(dev); +} + +static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev) +{ + return mlx5_ib_init_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); +} + +static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); +} + +static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) +{ + dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); + return PTR_ERR_OR_ZERO(dev->mdev->priv.uar); +} + +static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); +} + +int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); + if (err) + return err; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); + if (err) + mlx5_free_bfreg(dev->mdev, &dev->bfreg); + + return err; +} + +void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); +} + +static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev) +{ + return populate_specs_root(dev); +} + +int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) +{ + return ib_register_device(&dev->ib_dev, NULL); +} + +void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_umrc_res(dev); +} + +void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) +{ + ib_unregister_device(&dev->ib_dev); +} + +int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) +{ + return create_umr_res(dev); +} + +static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev) +{ + init_delay_drop(dev); + + return 0; +} + +static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) +{ + cancel_delay_drop(dev); +} + +int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev) +{ + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + if (err) + return err; + } + + return 0; +} + +static int mlx5_ib_stage_rep_reg_init(struct mlx5_ib_dev *dev) +{ + mlx5_ib_register_vport_reps(dev); + + return 0; +} + +static void mlx5_ib_stage_rep_reg_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_unregister_vport_reps(dev); +} + +void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, + int stage) +{ + /* Number of stages to cleanup */ + while (stage) { + stage--; + if (profile->stage[stage].cleanup) + profile->stage[stage].cleanup(dev); + } + + ib_dealloc_device((struct ib_device *)dev); +} + +void *__mlx5_ib_add(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile) +{ + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { + if (profile->stage[i].init) { + err = profile->stage[i].init(dev); + if (err) + goto err_out; + } + } + + dev->profile = profile; + dev->ib_active = true; + + return dev; + +err_out: + __mlx5_ib_remove(dev, profile, i); + + return NULL; +} + +static const struct mlx5_ib_profile pf_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, + mlx5_ib_stage_flow_db_init, + mlx5_ib_stage_flow_db_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_roce_init, + mlx5_ib_stage_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_ODP, + mlx5_ib_stage_odp_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + mlx5_ib_stage_cong_debugfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SPECS, + mlx5_ib_stage_populate_specs, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), +}; + +static const struct mlx5_ib_profile nic_rep_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, + mlx5_ib_stage_flow_db_init, + mlx5_ib_stage_flow_db_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_rep_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_rep_roce_init, + mlx5_ib_stage_rep_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SPECS, + mlx5_ib_stage_populate_specs, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_REP_REG, + mlx5_ib_stage_rep_reg_init, + mlx5_ib_stage_rep_reg_cleanup), +}; + +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + bool bound = false; + int err; + + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) + return NULL; + + mpi->mdev = mdev; + + err = mlx5_query_nic_vport_system_image_guid(mdev, + &mpi->sys_image_guid); + if (err) { + kfree(mpi); + return NULL; + } + + mutex_lock(&mlx5_ib_multiport_mutex); + list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { + if (dev->sys_image_guid == mpi->sys_image_guid) + bound = mlx5_ib_bind_slave_port(dev, mpi); + + if (bound) { + rdma_roce_rescan_device(&dev->ib_dev); + mpi->ibdev->ib_active = true; + break; + } + } + + if (!bound) { + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n"); + } + mutex_unlock(&mlx5_ib_multiport_mutex); + + return mpi; +} + +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + enum rdma_link_layer ll; + struct mlx5_ib_dev *dev; + int port_type_cap; + + printk_once(KERN_INFO "%s", mlx5_version); + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) + return mlx5_ib_add_slave_port(mdev); + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; + + dev->mdev = mdev; + dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports), + MLX5_CAP_GEN(mdev, num_vhca_ports)); + + if (MLX5_ESWITCH_MANAGER(mdev) && + mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) { + dev->rep = mlx5_ib_vport_rep(mdev->priv.eswitch, 0); + + return __mlx5_ib_add(dev, &nic_rep_profile); + } + + return __mlx5_ib_add(dev, &pf_profile); +} + +static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) +{ + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + + if (mlx5_core_is_mp_slave(mdev)) { + mpi = context; + mutex_lock(&mlx5_ib_multiport_mutex); + if (mpi->ibdev) + mlx5_ib_unbind_slave_port(mpi->ibdev, mpi); + list_del(&mpi->list); + mutex_unlock(&mlx5_ib_multiport_mutex); + kfree(mpi); + return; + } + + dev = context; + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); +} + +static struct mlx5_interface mlx5_ib_interface = { + .add = mlx5_ib_add, + .remove = mlx5_ib_remove, + .event = mlx5_ib_event, +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + .pfault = mlx5_ib_pfault, +#endif + .protocol = MLX5_INTERFACE_PROTOCOL_IB, +}; + +unsigned long mlx5_ib_get_xlt_emergency_page(void) +{ + mutex_lock(&xlt_emergency_page_mutex); + return xlt_emergency_page; +} + +void mlx5_ib_put_xlt_emergency_page(void) +{ + mutex_unlock(&xlt_emergency_page_mutex); +} + +static int __init mlx5_ib_init(void) +{ + int err; + + xlt_emergency_page = __get_free_page(GFP_KERNEL); + if (!xlt_emergency_page) + return -ENOMEM; + + mutex_init(&xlt_emergency_page_mutex); + + mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); + if (!mlx5_ib_event_wq) { + free_page(xlt_emergency_page); + return -ENOMEM; + } + + mlx5_ib_odp_init(); + + err = mlx5_register_interface(&mlx5_ib_interface); + + return err; +} + +static void __exit mlx5_ib_cleanup(void) +{ + mlx5_unregister_interface(&mlx5_ib_interface); + destroy_workqueue(mlx5_ib_event_wq); + mutex_destroy(&xlt_emergency_page_mutex); + free_page(xlt_emergency_page); +} + +module_init(mlx5_ib_init); +module_exit(mlx5_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c new file mode 100644 index 000000000..f3dbd75a0 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> +#include "mlx5_ib.h" + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @max_page_shift: high limit for page_shift - 0 means no limit + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, + unsigned long max_page_shift, + int *count, int *shift, + int *ncont, int *order) +{ + unsigned long tmp; + unsigned long m; + u64 base = ~0, p = 0; + u64 len, pfn; + int i = 0; + struct scatterlist *sg; + int entry; + unsigned long page_shift = umem->page_shift; + + if (umem->odp_data) { + *ncont = ib_umem_page_count(umem); + *count = *ncont << (page_shift - PAGE_SHIFT); + *shift = page_shift; + if (order) + *order = ilog2(roundup_pow_of_two(*ncont)); + + return; + } + + addr = addr >> page_shift; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, BITS_PER_LONG); + if (max_page_shift) + m = min_t(unsigned long, max_page_shift - page_shift, m); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + if (base + p != pfn) { + /* If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; + } + + p += len; + i += len; + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = page_shift + m; + *count = i; +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} +#endif + +/* + * Populate the given array with bus addresses from the umem. + * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. + */ +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags) +{ + unsigned long umem_page_shift = umem->page_shift; + int shift = page_shift - umem_page_shift; + int mask = (1 << shift) - 1; + int i, k, idx; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + int entry; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + const bool odp = umem->odp_data != NULL; + + if (odp) { + WARN_ON(shift != 0); + WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); + + for (i = 0; i < num_pages; ++i) { + dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } + return; + } +#endif + + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> umem_page_shift; + base = sg_dma_address(sg); + + /* Skip elements below offset */ + if (i + len < offset << shift) { + i += len; + continue; + } + + /* Skip pages below offset */ + if (i < offset << shift) { + k = (offset << shift) - i; + i = offset << shift; + } else { + k = 0; + } + + for (; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << umem_page_shift); + cur |= access_flags; + idx = (i >> shift) - offset; + + pas[idx] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, be64_to_cpu(pas[idx])); + } + i++; + + /* Stop after num_pages reached */ + if (i >> shift >= offset + num_pages) + return; + } + } +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + ib_umem_num_pages(umem), pas, + access_flags); +} +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = (u64)1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = buf_off >> ilog2(off_size); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h new file mode 100644 index 000000000..6a060c845 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_smi.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/cq.h> +#include <linux/mlx5/qp.h> +#include <linux/mlx5/srq.h> +#include <linux/types.h> +#include <linux/mlx5/transobj.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/mlx5-abi.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) \ +pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) +#define MLX5_IB_DEFAULT_UIDX 0xffffff +#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) + +#define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size) + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX5_CROSS_CHANNEL_BFREG = 0, +}; + +enum { + MLX5_CQE_VERSION_V0, + MLX5_CQE_VERSION_V1, +}; + +enum { + MLX5_TM_MAX_RNDV_MSG_SIZE = 64, + MLX5_TM_MAX_SGE = 1, +}; + +enum { + MLX5_IB_INVALID_UAR_INDEX = BIT(31), + MLX5_IB_INVALID_BFREG = BIT(31), +}; + +enum { + MLX5_MAX_MEMIC_PAGES = 0x100, + MLX5_MEMIC_ALLOC_SIZE_MASK = 0x3f, +}; + +enum { + MLX5_MEMIC_BASE_ALIGN = 6, + MLX5_MEMIC_BASE_SIZE = 1 << MLX5_MEMIC_BASE_ALIGN, +}; + +struct mlx5_ib_vma_private_data { + struct list_head list; + struct vm_area_struct *vma; + /* protect vma_private_list add/del */ + struct mutex *vma_private_list_mutex; +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_bfreg_info bfregi; + u8 cqe_version; + /* Transport Domain number */ + u32 tdn; + struct list_head vma_private_list; + /* protect vma_private_list add/del */ + struct mutex vma_private_list_mutex; + + u64 lib_caps; + DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES); + u16 devx_uid; + /* For RoCE LAG TX affinity */ + atomic_t tx_port_affinity; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +#define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) +#if (MLX5_IB_FLOW_LAST_PRIO <= 0) +#error "Invalid number of bypass priorities" +#endif +#define MLX5_IB_FLOW_LEFTOVERS_PRIO (MLX5_IB_FLOW_MCAST_PRIO + 1) + +#define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) +#define MLX5_IB_NUM_SNIFFER_FTS 2 +#define MLX5_IB_NUM_EGRESS_FTS 1 +struct mlx5_ib_flow_prio { + struct mlx5_flow_table *flow_table; + unsigned int refcount; +}; + +struct mlx5_ib_flow_handler { + struct list_head list; + struct ib_flow ibflow; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_handle *rule; + struct ib_counters *ibcounters; + struct mlx5_ib_dev *dev; + struct mlx5_ib_flow_matcher *flow_matcher; +}; + +struct mlx5_ib_flow_matcher { + struct mlx5_ib_match_params matcher_mask; + int mask_len; + enum mlx5_ib_flow_type flow_type; + u16 priority; + struct mlx5_core_dev *mdev; + atomic_t usecnt; + u8 match_criteria_enable; +}; + +struct mlx5_ib_flow_db { + struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; + struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS]; + struct mlx5_flow_table *lag_demux_ft; + /* Protect flow steering bypass flow tables + * when add/del flow rules. + * only single add/removal of flow steering rule could be done + * simultaneously. + */ + struct mutex lock; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_ENABLE_MR (IB_SEND_RESERVED_START << 0) +#define MLX5_IB_SEND_UMR_DISABLE_MR (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_SEND_UMR_UPDATE_XLT (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS IB_SEND_RESERVED_END + +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +/* + * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI + * creates the actual hardware QP. + */ +#define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 +#define MLX5_IB_QPT_DCI IB_QPT_RESERVED3 +#define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +#define MLX5_IB_UMR_OCTOWORD 16 +#define MLX5_IB_UMR_XLT_ALIGNMENT 64 + +#define MLX5_IB_UPD_XLT_ZAP BIT(0) +#define MLX5_IB_UPD_XLT_ENABLE BIT(1) +#define MLX5_IB_UPD_XLT_ATOMIC BIT(2) +#define MLX5_IB_UPD_XLT_ADDR BIT(3) +#define MLX5_IB_UPD_XLT_PD BIT(4) +#define MLX5_IB_UPD_XLT_ACCESS BIT(5) +#define MLX5_IB_UPD_XLT_INDIRECT BIT(6) + +/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the mlx5_ib driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. + */ + +/* Create a UD QP whose source QP number is 1 */ +static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void) +{ + return IB_QP_CREATE_RESERVED_START; +} + +struct wr_list { + u16 opcode; + u16 next; +}; + +enum mlx5_ib_rq_flags { + MLX5_IB_RQ_CVLAN_STRIPPING = 1 << 0, + MLX5_IB_RQ_PCI_WRITE_END_PADDING = 1 << 1, +}; + +struct mlx5_ib_wq { + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +enum mlx5_ib_wq_flags { + MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1, + MLX5_IB_WQ_FLAGS_STRIDING_RQ = 0x2, +}; + +#define MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES 9 +#define MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES 16 +#define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6 +#define MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES 13 + +struct mlx5_ib_rwq { + struct ib_wq ibwq; + struct mlx5_core_qp core_qp; + u32 rq_num_pas; + u32 log_rq_stride; + u32 log_rq_size; + u32 rq_page_offset; + u32 log_page_size; + u32 log_num_strides; + u32 two_byte_shift_en; + u32 single_stride_log_num_of_bytes; + struct ib_umem *umem; + size_t buf_size; + unsigned int page_shift; + int create_type; + struct mlx5_db db; + u32 user_index; + u32 wqe_count; + u32 wqe_shift; + int wq_sig; + u32 create_flags; /* Use enum mlx5_ib_wq_flags */ +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +enum { + MLX5_WQ_USER, + MLX5_WQ_KERNEL +}; + +struct mlx5_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_rwq_ind_tbl; + u32 rqtn; +}; + +struct mlx5_ib_ubuffer { + struct ib_umem *umem; + int buf_size; + u64 buf_addr; +}; + +struct mlx5_ib_qp_base { + struct mlx5_ib_qp *container_mibqp; + struct mlx5_core_qp mqp; + struct mlx5_ib_ubuffer ubuffer; +}; + +struct mlx5_ib_qp_trans { + struct mlx5_ib_qp_base base; + u16 xrcdn; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; +}; + +struct mlx5_ib_rss_qp { + u32 tirn; +}; + +struct mlx5_ib_rq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *rq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tirn; + u8 state; + u32 flags; +}; + +struct mlx5_ib_sq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *sq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + struct mlx5_flow_handle *flow_rule; + u32 tisn; + u8 state; +}; + +struct mlx5_ib_raw_packet_qp { + struct mlx5_ib_sq sq; + struct mlx5_ib_rq rq; +}; + +struct mlx5_bf { + int buf_size; + unsigned long offset; + struct mlx5_sq_bfreg *bfreg; +}; + +struct mlx5_ib_dct { + struct mlx5_core_dct mdct; + u32 *in; +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + union { + struct mlx5_ib_qp_trans trans_qp; + struct mlx5_ib_raw_packet_qp raw_packet_qp; + struct mlx5_ib_rss_qp rss_qp; + struct mlx5_ib_dct dct; + }; + struct mlx5_frag_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u8 sq_signal_bits; + u8 next_fence; + struct mlx5_ib_wq sq; + + /* serialize qp state modifications + */ + struct mutex mutex; + u32 flags; + u8 port; + u8 state; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf bf; + int has_rq; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int bfregn; + + int create_type; + + /* Store signature errors */ + bool signature_en; + + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; + struct mlx5_rate_limit rl; + u32 underlay_qpn; + bool tunnel_offload_en; + /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ + enum ib_qp_type qp_sub_type; +}; + +struct mlx5_ib_cq_buf { + struct mlx5_frag_buf_ctrl fbc; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, + /* QP uses 1 as its source QP number */ + MLX5_IB_QP_SQPN_QP1 = 1 << 6, + MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, + MLX5_IB_QP_RSS = 1 << 8, + MLX5_IB_QP_CVLAN_STRIPPING = 1 << 9, + MLX5_IB_QP_UNDERLAY = 1 << 10, + MLX5_IB_QP_PCI_WRITE_END_PADDING = 1 << 11, + MLX5_IB_QP_TUNNEL_OFFLOAD = 1 << 12, +}; + +struct mlx5_umr_wr { + struct ib_send_wr wr; + u64 virt_addr; + u64 offset; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int xlt_size; + u64 length; + int access_flags; + u32 mkey; + u8 ignore_free_state:1; +}; + +static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr) +{ + return container_of(wr, struct mlx5_umr_wr, wr); +} + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +enum mlx5_ib_cq_pr_flags { + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD = 1 << 0, +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; + struct list_head list_send_qp; + struct list_head list_recv_qp; + u32 create_flags; + struct list_head wc_list; + enum ib_cq_notify_flags notify_flags; + struct work_struct notify_work; + u16 private_flags; /* Use mlx5_ib_cq_pr_flags */ +}; + +struct mlx5_ib_wc { + struct ib_wc wc; + struct list_head list; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_frag_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +struct mlx5_ib_dm { + struct ib_dm ibdm; + phys_addr_t dev_addr; +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + +#define MLX5_IB_DM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE |\ + IB_ACCESS_REMOTE_WRITE |\ + IB_ACCESS_REMOTE_READ |\ + IB_ACCESS_REMOTE_ATOMIC |\ + IB_ZERO_BASED) + +struct mlx5_ib_mr { + struct ib_mr ibmr; + void *descs; + dma_addr_t desc_map; + int ndescs; + int max_descs; + int desc_size; + int access_mode; + struct mlx5_core_mkey mmkey; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + bool allocated_from_cache; + int npages; + struct mlx5_ib_dev *dev; + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; + struct mlx5_core_sig_ctx *sig; + int live; + void *descs_alloc; + int access_flags; /* Needed for rereg MR */ + + struct mlx5_ib_mr *parent; + atomic_t num_leaf_free; + wait_queue_head_t q_leaf_free; +}; + +struct mlx5_ib_mw { + struct ib_mw ibmw; + struct mlx5_core_mkey mmkey; + int ndescs; +}; + +struct mlx5_ib_umr_context { + struct ib_cqe cqe; + enum ib_wc_status status; + struct completion done; +}; + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + /* control access to UMR QP + */ + struct semaphore sem; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + struct dentry *dir; + char name[4]; + u32 order; + u32 xlt; + u32 access_mode; + u32 page; + + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct dentry *fsize; + struct dentry *fcur; + struct dentry *fmiss; + struct dentry *flimit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; + struct completion compl; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + unsigned long last_add; +}; + +struct mlx5_ib_gsi_qp; + +struct mlx5_ib_port_resources { + struct mlx5_ib_resources *devr; + struct mlx5_ib_gsi_qp *gsi; + struct work_struct pkey_change_work; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; + struct ib_srq *s1; + struct mlx5_ib_port_resources ports[2]; + /* Protects changes to the port resources */ + struct mutex mutex; +}; + +struct mlx5_ib_counters { + const char **names; + size_t *offsets; + u32 num_q_counters; + u32 num_cong_counters; + u32 num_ext_ppcnt_counters; + u16 set_id; + bool set_id_valid; +}; + +struct mlx5_ib_multiport_info; + +struct mlx5_ib_multiport { + struct mlx5_ib_multiport_info *mpi; + /* To be held when accessing the multiport info */ + spinlock_t mpi_lock; +}; + +struct mlx5_ib_port { + struct mlx5_ib_counters cnts; + struct mlx5_ib_multiport mp; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; +}; + +struct mlx5_roce { + /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL + * netdev pointer + */ + rwlock_t netdev_lock; + struct net_device *netdev; + struct notifier_block nb; + atomic_t tx_port_affinity; + enum ib_port_state last_port_state; + struct mlx5_ib_dev *dev; + u8 native_port_num; +}; + +struct mlx5_ib_dbg_param { + int offset; + struct mlx5_ib_dev *dev; + struct dentry *dentry; + u8 port_num; +}; + +enum mlx5_ib_dbg_cc_types { + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE, + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI, + MLX5_IB_DBG_CC_RP_TIME_RESET, + MLX5_IB_DBG_CC_RP_BYTE_RESET, + MLX5_IB_DBG_CC_RP_THRESHOLD, + MLX5_IB_DBG_CC_RP_AI_RATE, + MLX5_IB_DBG_CC_RP_HAI_RATE, + MLX5_IB_DBG_CC_RP_MIN_DEC_FAC, + MLX5_IB_DBG_CC_RP_MIN_RATE, + MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP, + MLX5_IB_DBG_CC_RP_DCE_TCP_G, + MLX5_IB_DBG_CC_RP_DCE_TCP_RTT, + MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD, + MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE, + MLX5_IB_DBG_CC_RP_GD, + MLX5_IB_DBG_CC_NP_CNP_DSCP, + MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, + MLX5_IB_DBG_CC_NP_CNP_PRIO, + MLX5_IB_DBG_CC_MAX, +}; + +struct mlx5_ib_dbg_cc_params { + struct dentry *root; + struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; +}; + +enum { + MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100, +}; + +struct mlx5_ib_dbg_delay_drop { + struct dentry *dir_debugfs; + struct dentry *rqs_cnt_debugfs; + struct dentry *events_cnt_debugfs; + struct dentry *timeout_debugfs; +}; + +struct mlx5_ib_delay_drop { + struct mlx5_ib_dev *dev; + struct work_struct delay_drop_work; + /* serialize setting of delay drop */ + struct mutex lock; + u32 timeout; + bool activate; + atomic_t events_cnt; + atomic_t rqs_cnt; + struct mlx5_ib_dbg_delay_drop *dbg; +}; + +enum mlx5_ib_stages { + MLX5_IB_STAGE_INIT, + MLX5_IB_STAGE_FLOW_DB, + MLX5_IB_STAGE_CAPS, + MLX5_IB_STAGE_NON_DEFAULT_CB, + MLX5_IB_STAGE_ROCE, + MLX5_IB_STAGE_DEVICE_RESOURCES, + MLX5_IB_STAGE_ODP, + MLX5_IB_STAGE_COUNTERS, + MLX5_IB_STAGE_CONG_DEBUGFS, + MLX5_IB_STAGE_UAR, + MLX5_IB_STAGE_BFREG, + MLX5_IB_STAGE_PRE_IB_REG_UMR, + MLX5_IB_STAGE_SPECS, + MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_POST_IB_REG_UMR, + MLX5_IB_STAGE_DELAY_DROP, + MLX5_IB_STAGE_CLASS_ATTR, + MLX5_IB_STAGE_REP_REG, + MLX5_IB_STAGE_MAX, +}; + +struct mlx5_ib_stage { + int (*init)(struct mlx5_ib_dev *dev); + void (*cleanup)(struct mlx5_ib_dev *dev); +}; + +#define STAGE_CREATE(_stage, _init, _cleanup) \ + .stage[_stage] = {.init = _init, .cleanup = _cleanup} + +struct mlx5_ib_profile { + struct mlx5_ib_stage stage[MLX5_IB_STAGE_MAX]; +}; + +struct mlx5_ib_multiport_info { + struct list_head list; + struct mlx5_ib_dev *ibdev; + struct mlx5_core_dev *mdev; + struct completion unref_comp; + u64 sys_image_guid; + u32 mdev_refcnt; + bool is_master; + bool unaffiliate; +}; + +struct mlx5_ib_flow_action { + struct ib_flow_action ib_action; + union { + struct { + u64 ib_flags; + struct mlx5_accel_esp_xfrm *ctx; + } esp_aes_gcm; + }; +}; + +struct mlx5_memic { + struct mlx5_core_dev *dev; + spinlock_t memic_lock; + DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); +}; + +struct mlx5_read_counters_attr { + struct mlx5_fc *hw_cntrs_hndl; + u64 *out; + u32 flags; +}; + +enum mlx5_ib_counters_type { + MLX5_IB_COUNTERS_FLOW, +}; + +struct mlx5_ib_mcounters { + struct ib_counters ibcntrs; + enum mlx5_ib_counters_type type; + /* number of counters supported for this counters type */ + u32 counters_num; + struct mlx5_fc *hw_cntrs_hndl; + /* read function for this counters type */ + int (*read_counters)(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr); + /* max index set as part of create_flow */ + u32 cntrs_max_index; + /* number of counters data entries (<description,index> pair) */ + u32 ncounters; + /* counters data array for descriptions and indexes */ + struct mlx5_ib_flow_counters_desc *counters_data; + /* protects access to mcounters internal data */ + struct mutex mcntrs_mutex; +}; + +static inline struct mlx5_ib_mcounters * +to_mcounters(struct ib_counters *ibcntrs) +{ + return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs); +} + +struct mlx5_ib_dev { + struct ib_device ib_dev; + const struct uverbs_object_tree_def *driver_trees[6]; + struct mlx5_core_dev *mdev; + struct mlx5_roce roce[MLX5_MAX_PORTS]; + int num_ports; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + struct mlx5_ib_resources devr; + struct mlx5_mr_cache cache; + struct timer_list delay_timer; + /* Prevents soft lock on massive reg MRs */ + struct mutex slow_path_mutex; + int fill_delay; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_odp_caps odp_caps; + u64 odp_max_size; + /* + * Sleepable RCU that prevents destruction of MRs while they are still + * being used by a page fault handler. + */ + struct srcu_struct mr_srcu; + u32 null_mkey; +#endif + struct mlx5_ib_flow_db *flow_db; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + /* Array with num_ports elements */ + struct mlx5_ib_port *port; + struct mlx5_sq_bfreg bfreg; + struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_delay_drop delay_drop; + const struct mlx5_ib_profile *profile; + struct mlx5_eswitch_rep *rep; + + /* protect the user_td */ + struct mutex lb_mutex; + u32 user_td; + u8 umr_fence; + struct list_head ib_dev_list; + u64 sys_image_guid; + struct mlx5_memic memic; +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; +} + +static inline struct mlx5_ib_rwq *to_mibrwq(struct mlx5_core_qp *core_qp) +{ + return container_of(core_qp, struct mlx5_ib_rwq, core_qp); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey) +{ + return container_of(mmkey, struct mlx5_ib_mr, mmkey); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) +{ + return container_of(ibwq, struct mlx5_ib_rwq, ibwq); +} + +static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_dm *to_mdm(struct ib_dm *ibdm) +{ + return container_of(ibdm, struct mlx5_ib_dm, ibdm); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx5_ib_mw, ibmw); +} + +static inline struct mlx5_ib_flow_action * +to_mflow_act(struct ib_flow_action *ibact) +{ + return container_of(ibact, struct mlx5_ib_flow_action, ib_action); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +void mlx5_ib_drain_sq(struct ib_qp *qp); +void mlx5_ib_drain_rq(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length, + struct mlx5_ib_qp_base *base); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx5_ib_dealloc_mw(struct ib_mw *mw); +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags); +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags); +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int access_flags, + struct ib_pd *pd, struct ib_udata *udata); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); +int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, + struct ib_smp *out_mad); +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid); +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys); +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, + u32 *vendor_id); +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc); +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid); +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, + unsigned long max_page_shift, + int *count, int *shift, + int *ncont, int *order); +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); + +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry); +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_wq(struct ib_wq *wq); +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev); +struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs); +int mlx5_ib_dealloc_dm(struct ib_dm *ibdm); +struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault); +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end); +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags); +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) +{ + return; +} + +static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} +static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, + int flags) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +/* Needed for rep profile */ +int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev); +void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, + int stage); +void *__mlx5_ib_add(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile); + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, + u8 port, struct ifla_vf_info *info); +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u8 port, int state); +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u8 port, struct ifla_vf_stats *stats); +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, + u64 guid, int type); + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr); + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); + +/* GSI QP helper functions */ +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr); +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask); +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); + +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); + +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, + int bfregn); +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi); +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev, + u8 ib_port_num, + u8 *native_port_num); +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, + u8 port_num); + +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context); +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context); +const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void); +struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( + struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, + void *cmd_in, int inlen, int dest_id, int dest_type); +bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); +int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root); +#else +static inline int +mlx5_ib_devx_create(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) { return -EOPNOTSUPP; }; +static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) {} +static inline const struct uverbs_object_tree_def * +mlx5_ib_get_devx_tree(void) { return NULL; } +static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, + int *dest_type) +{ + return false; +} +static inline int +mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) +{ + return 0; +} +#endif +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +static inline int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == MLX5_IB_QPT_HW_GSI; +} + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +static inline u32 check_cq_create_flags(u32 flags) +{ + /* + * It returns non-zero value for unsupported CQ + * create flags, otherwise it returns zero. + */ + return (flags & ~(IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN | + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); +} + +static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, + u32 *user_index) +{ + if (cqe_version) { + if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || + (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) + return -EINVAL; + *user_index = cmd_uidx; + } else { + *user_index = MLX5_IB_DEFAULT_UIDX; + } + + return 0; +} + +static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_qp *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_srq *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_support) +{ + return lib_support && MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_UARS_IN_PAGE : 1; +} + +static inline int get_num_static_uars(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; +} + +unsigned long mlx5_ib_get_xlt_emergency_page(void); +void mlx5_ib_put_xlt_emergency_page(void); + +int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, u32 bfregn, + bool dyn_bfreg); +#endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c new file mode 100644 index 000000000..18fd9aa65 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -0,0 +1,1984 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include <linux/kref.h> +#include <linux/random.h> +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> +#include <rdma/ib_verbs.h> +#include "mlx5_ib.h" + +enum { + MAX_PENDING_REG_MR = 8, +}; + +#define MLX5_UMR_ALIGN 2048 + +static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static int mr_cache_max_order(struct mlx5_ib_dev *dev); +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); + +static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) +{ + return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); +} + +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* Wait until all page fault handlers using the mr complete. */ + synchronize_srcu(&dev->mr_srcu); +#endif + + return err; +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) +{ + return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= + length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static void update_odp_mr(struct mlx5_ib_mr *mr) +{ + if (mr->umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +} +#endif + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; + int err; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + mr->mmkey.type = MLX5_MKEY_MR; + spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); + key = dev->mdev->priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); + mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey); + if (err) + pr_err("Error inserting to mkey tree. 0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); + + if (!completion_done(&ent->compl)) + complete(&ent->compl); +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err = 0; + int i; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->allocated_from_cache = 1; + mr->dev = dev; + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, + (ent->access_mode >> 2) & 0x7); + + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); + MLX5_SET(mkc, mkc, log_page_size, ent->page); + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey, + in, inlen, + mr->out, sizeof(mr->out), + reg_mr_callback, mr); + if (err) { + spin_lock_irq(&ent->lock); + ent->pending--; + spin_unlock_irq(&ent->lock); + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *tmp_mr; + struct mlx5_ib_mr *mr; + LIST_HEAD(del_list); + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + break; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_move(&mr->list, &del_list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + synchronize_srcu(&dev->mr_srcu); +#endif + + list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { + list_del(&mr->list); + kfree(mr); + } +} + +static ssize_t size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20] = {0}; + u32 var; + int err; + int c; + + count = min(count, sizeof(lbuf) - 1); + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + c = order2idx(dev, ent->order); + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, c, ent->size - var); + } + + return count; +} + +static ssize_t size_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); + if (err < 0) + return err; + + return simple_read_from_buffer(buf, count, pos, lbuf, err); +} + +static const struct file_operations size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = size_write, + .read = size_read, +}; + +static ssize_t limit_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20] = {0}; + u32 var; + int err; + int c; + + count = min(count, sizeof(lbuf) - 1); + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + c = order2idx(dev, ent->order); + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, c, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); + if (err < 0) + return err; + + return simple_read_from_buffer(buf, count, pos, lbuf, err); +} + +static const struct file_operations limit_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = limit_write, + .read = limit_read, +}; + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } + } else if (ent->cur > 2 * ent->limit) { + /* + * The remove_keys() logic is performed as garbage collection + * task. Such task is intended to be run when no other active + * processes are running. + * + * The need_resched() will return TRUE if there are user tasks + * to be activated in near future. + * + * In such case, we don't execute remove_keys() and postpone + * the garbage collection work to try to run in next cycle, + * in order to free CPU resources to other tasks. + */ + if (!need_resched() && !someone_adding(cache) && + time_after(jiffies, cache->last_add + 300 * HZ)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + queue_work(cache->wq, &ent->work); + } else { + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); + } + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + struct mlx5_ib_mr *mr; + int err; + + if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_err(dev, "cache entry %d is out of range\n", entry); + return ERR_PTR(-EINVAL); + } + + ent = &cache->ent[entry]; + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + + err = add_keys(dev, entry, 1); + if (err && err != -EAGAIN) + return ERR_PTR(err); + + wait_for_completion(&ent->compl); + } else { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + return mr; + } + } +} + +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int last_umr_cache_entry; + int c; + int i; + + c = order2idx(dev, order); + last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev)); + if (c < 0 || c > last_umr_cache_entry) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i <= last_umr_cache_entry; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + break; + } + spin_unlock_irq(&ent->lock); + + queue_work(cache->wq, &ent->work); + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + if (!mr->allocated_from_cache) + return; + + c = order2idx(dev, mr->order); + WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES); + + if (unreg_umr(dev, mr)) { + mr->allocated_from_cache = false; + destroy_mkey(dev, mr); + ent = &cache->ent[c]; + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + return; + } + + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + queue_work(cache->wq, &ent->work); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *tmp_mr; + struct mlx5_ib_mr *mr; + LIST_HEAD(del_list); + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + break; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_move(&mr->list, &del_list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + synchronize_srcu(&dev->mr_srcu); +#endif + + list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { + list_del(&mr->list); + kfree(mr); + } +} + +static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root || dev->rep) + return; + + debugfs_remove_recursive(dev->cache.root); + dev->cache.root = NULL; +} + +static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int i; + + if (!mlx5_debugfs_root || dev->rep) + return 0; + + cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root); + if (!cache->root) + return -ENOMEM; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + sprintf(ent->name, "%d", ent->order); + ent->dir = debugfs_create_dir(ent->name, cache->root); + if (!ent->dir) + goto err; + + ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent, + &size_fops); + if (!ent->fsize) + goto err; + + ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent, + &limit_fops); + if (!ent->flimit) + goto err; + + ent->fcur = debugfs_create_u32("cur", 0400, ent->dir, + &ent->cur); + if (!ent->fcur) + goto err; + + ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir, + &ent->miss); + if (!ent->fmiss) + goto err; + } + + return 0; +err: + mlx5_mr_cache_debugfs_cleanup(dev); + + return -ENOMEM; +} + +static void delay_time_func(struct timer_list *t) +{ + struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); + + dev->fill_delay = 0; +} + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int err; + int i; + + mutex_init(&dev->slow_path_mutex); + cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + timer_setup(&dev->delay_timer, delay_time_func, 0); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + ent->limit = 0; + + init_completion(&ent->compl); + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + + if (i > MR_CACHE_LAST_STD_ENTRY) { + mlx5_odp_init_mr_cache_entry(ent); + continue; + } + + if (ent->order > mr_cache_max_order(dev)) + continue; + + ent->page = PAGE_SHIFT; + ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && + !dev->rep && + mlx5_core_is_pf(dev->mdev)) + ent->limit = dev->mdev->profile->mr_cache[i].limit; + else + ent->limit = 0; + queue_work(cache->wq, &ent->work); + } + + err = mlx5_mr_cache_debugfs_init(dev); + if (err) + mlx5_ib_warn(dev, "cache debugfs failure\n"); + + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. + */ + + return 0; +} + +static void wait_for_async_commands(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int total = 0; + int i; + int j; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + for (j = 0 ; j < 1000; j++) { + if (!ent->pending) + break; + msleep(50); + } + } + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + total += ent->pending; + } + + if (total) + mlx5_ib_warn(dev, "aborted while there are %d pending mr requests\n", total); + else + mlx5_ib_warn(dev, "done with all pending requests\n"); +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + if (!dev->cache.wq) + return 0; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + + mlx5_mr_cache_debugfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + wait_for_async_commands(dev); + del_timer_sync(&dev->delay_timer); + + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, 0); + + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); + if (err) + goto err_in; + + kfree(in); + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_shift) +{ + u64 page_size = 1ULL << page_shift; + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> page_shift; + return (npages + 1) / 2; +} + +static int mr_cache_max_order(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + return MR_CACHE_LAST_STD_ENTRY + 2; + return MLX5_MAX_UMR_SHIFT; +} + +static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, + int access_flags, struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + int *order) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem *u; + int err; + + *umem = NULL; + + u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0); + err = PTR_ERR_OR_ZERO(u); + if (err) { + mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); + return err; + } + + mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, + page_shift, ncont, order); + if (!*npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + ib_umem_release(u); + return -EINVAL; + } + + *umem = u; + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + *npages, *ncont, *order, *page_shift); + + return 0; +} + +static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); + + context->status = wc->status; + complete(&context->done); +} + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->cqe.done = mlx5_ib_umr_done; + context->status = -1; + init_completion(&context->done); +} + +static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, + struct mlx5_umr_wr *umrwr) +{ + struct umr_common *umrc = &dev->umrc; + const struct ib_send_wr *bad; + int err; + struct mlx5_ib_umr_context umr_context; + + mlx5_ib_init_umr_context(&umr_context); + umrwr->wr.wr_cqe = &umr_context.cqe; + + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr->wr, &bad); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + return err; +} + +static struct mlx5_ib_mr *alloc_mr_from_cache( + struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + mr->ibmr.pd = pd; + mr->umem = umem; + mr->access_flags = access_flags; + mr->desc_size = sizeof(struct mlx5_mtt); + mr->mmkey.iova = virt_addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; + + return mr; +} + +static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages, + void *xlt, int page_shift, size_t size, + int flags) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct ib_umem *umem = mr->umem; + + if (flags & MLX5_IB_UPD_XLT_INDIRECT) { + if (!umr_can_use_indirect_mkey(dev)) + return -EPERM; + mlx5_odp_populate_klm(xlt, idx, npages, mr, flags); + return npages; + } + + npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx); + + if (!(flags & MLX5_IB_UPD_XLT_ZAP)) { + __mlx5_ib_populate_pas(dev, umem, page_shift, + idx, npages, xlt, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages + * brought from the umem. + */ + memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0, + size - npages * sizeof(struct mlx5_mtt)); + } + + return npages; +} + +#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ + MLX5_UMR_MTT_ALIGNMENT) +#define MLX5_SPARE_UMR_CHUNK 0x10000 + +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dev.parent; + int size; + void *xlt; + dma_addr_t dma; + struct mlx5_umr_wr wr; + struct ib_sge sg; + int err = 0; + int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) + ? sizeof(struct mlx5_klm) + : sizeof(struct mlx5_mtt); + const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; + const int page_mask = page_align - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter = 0; + gfp_t gfp; + bool use_emergency_page = false; + + if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && + !umr_can_use_indirect_mkey(dev)) + return -EPERM; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly + */ + if (idx & page_mask) { + npages += idx & page_mask; + idx &= ~page_mask; + } + + gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL; + gfp |= __GFP_ZERO | __GFP_NOWARN; + + pages_to_map = ALIGN(npages, page_align); + size = desc_size * pages_to_map; + size = min_t(int, size, MLX5_MAX_UMR_CHUNK); + + xlt = (void *)__get_free_pages(gfp, get_order(size)); + if (!xlt && size > MLX5_SPARE_UMR_CHUNK) { + mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation od %d bytes\n", + size, get_order(size), MLX5_SPARE_UMR_CHUNK); + + size = MLX5_SPARE_UMR_CHUNK; + xlt = (void *)__get_free_pages(gfp, get_order(size)); + } + + if (!xlt) { + mlx5_ib_warn(dev, "Using XLT emergency buffer\n"); + xlt = (void *)mlx5_ib_get_xlt_emergency_page(); + size = PAGE_SIZE; + memset(xlt, 0, size); + use_emergency_page = true; + } + pages_iter = size / desc_size; + dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); + err = -ENOMEM; + goto free_xlt; + } + + sg.addr = dma; + sg.lkey = dev->umrc.pd->local_dma_lkey; + + memset(&wr, 0, sizeof(wr)); + wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; + if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) + wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr.wr.sg_list = &sg; + wr.wr.num_sge = 1; + wr.wr.opcode = MLX5_IB_WR_UMR; + + wr.pd = mr->ibmr.pd; + wr.mkey = mr->mmkey.key; + wr.length = mr->mmkey.size; + wr.virt_addr = mr->mmkey.iova; + wr.access_flags = mr->access_flags; + wr.page_shift = page_shift; + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, idx += pages_iter) { + npages = min_t(int, pages_iter, pages_to_map - pages_mapped); + dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); + npages = populate_xlt(mr, idx, npages, xlt, + page_shift, size, flags); + + dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + + sg.length = ALIGN(npages * desc_size, + MLX5_UMR_MTT_ALIGNMENT); + + if (pages_mapped + pages_iter >= pages_to_map) { + if (flags & MLX5_IB_UPD_XLT_ENABLE) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_ENABLE_MR | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + if (flags & MLX5_IB_UPD_XLT_PD || + flags & MLX5_IB_UPD_XLT_ACCESS) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + if (flags & MLX5_IB_UPD_XLT_ADDR) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + } + + wr.offset = idx * desc_size; + wr.xlt_size = sg.length; + + err = mlx5_ib_post_send_wait(dev, &wr); + } + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_xlt: + if (use_emergency_page) + mlx5_ib_put_xlt_emergency_page(); + else + free_pages((unsigned long)xlt, get_order(size)); + + return err; +} + +/* + * If ibmr is NULL it will be allocated by reg_create. + * Else, the given ibmr will be used. + */ +static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, + u64 virt_addr, u64 length, + struct ib_umem *umem, int npages, + int page_shift, int access_flags, + bool populate) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + __be64 *pas; + void *mkc; + int inlen; + u32 *in; + int err; + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + + mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + if (populate) + inlen += sizeof(*pas) * roundup(npages, 2); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + if (populate && !(access_flags & IB_ACCESS_ON_DEMAND)) + mlx5_ib_populate_pas(dev, umem, page_shift, pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + + /* The pg_access bit allows setting the access flags + * in the page list submitted with the command. */ + MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, !populate); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + + MLX5_SET64(mkc, mkc, start_addr, virt_addr); + MLX5_SET64(mkc, mkc, len, length); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, bsf_octword_size, 0); + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(virt_addr, length, page_shift)); + MLX5_SET(mkc, mkc, log_page_size, page_shift); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + if (populate) { + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + get_octo_len(virt_addr, length, page_shift)); + } + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->mmkey.type = MLX5_MKEY_MR; + mr->desc_size = sizeof(struct mlx5_mtt); + mr->dev = dev; + kvfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); + + return mr; + +err_2: + kvfree(in); + +err_1: + if (!ibmr) + kfree(mr); + + return ERR_PTR(err); +} + +static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + int npages, u64 length, int access_flags) +{ + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->ibmr.length = length; + mr->access_flags = access_flags; +} + +static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr, + u64 length, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MEMIC & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, + (MLX5_MKC_ACCESS_MODE_MEMIC >> 2) & 0x7); + MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET64(mkc, mkc, len, length); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, + memic_addr - pci_resource_start(dev->mdev->pdev, 0)); + + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); + if (err) + goto err_in; + + kfree(in); + + mr->umem = NULL; + set_mr_fileds(dev, mr, 0, length, acc); + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dm *mdm = to_mdm(dm); + u64 memic_addr; + + if (attr->access_flags & ~MLX5_IB_DM_ALLOWED_ACCESS) + return ERR_PTR(-EINVAL); + + memic_addr = mdm->dev_addr + attr->offset; + + return mlx5_ib_get_memic_mr(pd, memic_addr, attr->length, + attr->access_flags); +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + bool use_umr; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + return ERR_PTR(-EOPNOTSUPP); + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (!start && length == U64_MAX) { + if (!(access_flags & IB_ACCESS_ON_DEMAND) || + !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return ERR_PTR(-EINVAL); + + mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); + if (IS_ERR(mr)) + return ERR_CAST(mr); + return &mr->ibmr; + } +#endif + + err = mr_umem_get(pd, start, length, access_flags, &umem, &npages, + &page_shift, &ncont, &order); + + if (err < 0) + return ERR_PTR(err); + + use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) && + (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) || + !MLX5_CAP_GEN(dev->mdev, atomic)); + + if (order <= mr_cache_max_order(dev) && use_umr) { + mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont, + page_shift, order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d\n", order); + mr = NULL; + } + } else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) { + if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n"); + goto error; + } + use_umr = false; + } + + if (!mr) { + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, + page_shift, access_flags, !use_umr); + mutex_unlock(&dev->slow_path_mutex); + } + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); + + mr->umem = umem; + set_mr_fileds(dev, mr, npages, length, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif + + if (use_umr) { + int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE; + + if (access_flags & IB_ACCESS_ON_DEMAND) + update_xlt_flags |= MLX5_IB_UPD_XLT_ZAP; + + err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift, + update_xlt_flags); + + if (err) { + dereg_mr(dev, mr); + return ERR_PTR(err); + } + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + mr->live = 1; +#endif + return &mr->ibmr; +error: + ib_umem_release(umem); + return ERR_PTR(err); +} + +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_umr_wr umrwr = {}; + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.pd = dev->umrc.pd; + umrwr.mkey = mr->mmkey.key; + umrwr.ignore_free_state = 1; + + return mlx5_ib_post_send_wait(dev, &umrwr); +} + +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int access_flags, int flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_umr_wr umrwr = {}; + int err; + + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; + + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.mkey = mr->mmkey.key; + + if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) { + umrwr.pd = pd; + umrwr.access_flags = access_flags; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + } + + err = mlx5_ib_post_send_wait(dev, &umrwr); + + return err; +} + +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int new_access_flags, + struct ib_pd *new_pd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); + struct mlx5_ib_mr *mr = to_mmr(ib_mr); + struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd; + int access_flags = flags & IB_MR_REREG_ACCESS ? + new_access_flags : + mr->access_flags; + int page_shift = 0; + int upd_flags = 0; + int npages = 0; + int ncont = 0; + int order = 0; + u64 addr, len; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + + if (!mr->umem) + return -EINVAL; + + if (flags & IB_MR_REREG_TRANS) { + addr = virt_addr; + len = length; + } else { + addr = mr->umem->address; + len = mr->umem->length; + } + + if (flags != IB_MR_REREG_PD) { + /* + * Replace umem. This needs to be done whether or not UMR is + * used. + */ + flags |= IB_MR_REREG_TRANS; + ib_umem_release(mr->umem); + mr->umem = NULL; + err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, + &npages, &page_shift, &ncont, &order); + if (err) + goto err; + } + + if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { + /* + * UMR can't be used - MKey needs to be replaced. + */ + if (mr->allocated_from_cache) + err = unreg_umr(dev, mr); + else + err = destroy_mkey(dev, mr); + if (err) + goto err; + + mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, + page_shift, access_flags, true); + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + mr = to_mmr(ib_mr); + goto err; + } + + mr->allocated_from_cache = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + mr->live = 1; +#endif + } else { + /* + * Send a UMR WQE + */ + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + mr->mmkey.iova = addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; + + if (flags & IB_MR_REREG_TRANS) { + upd_flags = MLX5_IB_UPD_XLT_ADDR; + if (flags & IB_MR_REREG_PD) + upd_flags |= MLX5_IB_UPD_XLT_PD; + if (flags & IB_MR_REREG_ACCESS) + upd_flags |= MLX5_IB_UPD_XLT_ACCESS; + err = mlx5_ib_update_xlt(mr, 0, npages, page_shift, + upd_flags); + } else { + err = rereg_umr(pd, mr, access_flags, flags); + } + + if (err) + goto err; + } + + set_mr_fileds(dev, mr, npages, len, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif + return 0; + +err: + if (mr->umem) { + ib_umem_release(mr->umem); + mr->umem = NULL; + } + clean_mr(dev, mr); + return err; +} + +static int +mlx5_alloc_priv_descs(struct ib_device *device, + struct mlx5_ib_mr *mr, + int ndescs, + int desc_size) +{ + int size = ndescs * desc_size; + int add_size; + int ret; + + add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); + + mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); + if (!mr->descs_alloc) + return -ENOMEM; + + mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); + + mr->desc_map = dma_map_single(device->dev.parent, mr->descs, + size, DMA_TO_DEVICE); + if (dma_mapping_error(device->dev.parent, mr->desc_map)) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + kfree(mr->descs_alloc); + + return ret; +} + +static void +mlx5_free_priv_descs(struct mlx5_ib_mr *mr) +{ + if (mr->descs) { + struct ib_device *device = mr->ibmr.device; + int size = mr->max_descs * mr->desc_size; + + dma_unmap_single(device->dev.parent, mr->desc_map, + size, DMA_TO_DEVICE); + kfree(mr->descs_alloc); + mr->descs = NULL; + } +} + +static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int allocated_from_cache = mr->allocated_from_cache; + + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + mr->sig = NULL; + } + + if (!allocated_from_cache) { + destroy_mkey(dev, mr); + mlx5_free_priv_descs(mr); + } +} + +static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int npages = mr->npages; + struct ib_umem *umem = mr->umem; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem && umem->odp_data) { + /* Prevent new page faults from succeeding */ + mr->live = 0; + /* Wait for all running page-fault handlers to finish. */ + synchronize_srcu(&dev->mr_srcu); + /* Destroy all page mappings */ + if (umem->odp_data->page_list) + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + else + mlx5_ib_free_implicit_mr(mr); + /* + * We kill the umem before the MR for ODP, + * so that there will not be any invalidations in + * flight, looking at the *mr struct. + */ + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + + /* Avoid double-freeing the umem. */ + umem = NULL; + } +#endif + clean_mr(dev, mr); + + /* + * We should unregister the DMA address from the HCA before + * remove the DMA mapping. + */ + mlx5_mr_cache_free(dev, mr); + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } + if (!mr->allocated_from_cache) + kfree(mr); +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr)); + return 0; +} + +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + int ndescs = ALIGN(max_num_sg, 4); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + + if (mr_type == IB_MR_TYPE_MEM_REG) { + mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_mtt)); + if (err) + goto err_free_in; + + mr->desc_size = sizeof(struct mlx5_mtt); + mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SG_GAPS) { + mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_klm)); + if (err) + goto err_free_in; + mr->desc_size = sizeof(struct mlx5_klm); + mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SIGNATURE) { + u32 psv_index[2]; + + MLX5_SET(mkc, mkc, bsf_en, 1); + MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } else { + mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); + err = -EINVAL; + goto err_free_in; + } + + MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, umr_en, 1); + + mr->ibmr.device = pd->device; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + if (err) + goto err_destroy_psv; + + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } + mlx5_free_priv_descs(mr); +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mw *mw = NULL; + u32 *in = NULL; + void *mkc; + int ndescs; + int err; + struct mlx5_ib_alloc_mw req = {}; + struct { + __u32 comp_mask; + __u32 response_length; + } resp = {}; + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return ERR_PTR(err); + + if (req.comp_mask || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + udata->inlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + + ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); + if (!mw || !in) { + err = -ENOMEM; + goto free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); + MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2))); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen); + if (err) + goto free; + + mw->mmkey.type = MLX5_MKEY_MW; + mw->ibmw.rkey = mw->mmkey.key; + mw->ndescs = ndescs; + + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + if (resp.response_length) { + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); + goto free; + } + } + + kfree(in); + return &mw->ibmw; + +free: + kfree(mw); + kfree(in); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_mw(struct ib_mw *mw) +{ + struct mlx5_ib_mw *mmw = to_mmw(mw); + int err; + + err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev, + &mmw->mmkey); + if (!err) + kfree(mmw); + return err; +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} + +static int +mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, + struct scatterlist *sgl, + unsigned short sg_nents, + unsigned int *sg_offset_p) +{ + struct scatterlist *sg = sgl; + struct mlx5_klm *klms = mr->descs; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + u32 lkey = mr->ibmr.pd->local_dma_lkey; + int i; + + mr->ibmr.iova = sg_dma_address(sg) + sg_offset; + mr->ibmr.length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + if (unlikely(i >= mr->max_descs)) + break; + klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); + klms[i].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg) - sg_offset; + + sg_offset = 0; + } + mr->ndescs = i; + + if (sg_offset_p) + *sg_offset_p = sg_offset; + + return i; +} + +static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + __be64 *descs; + + if (unlikely(mr->ndescs == mr->max_descs)) + return -ENOMEM; + + descs = mr->descs; + descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); + + return 0; +} + +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int n; + + mr->ndescs = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); + else + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + mlx5_set_page); + + ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + return n; +} diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 000000000..453e5c4ac --- /dev/null +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -0,0 +1,1230 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> +#include <linux/kernel.h> + +#include "mlx5_ib.h" +#include "cmd.h" + +#define MAX_PREFETCH_LEN (4*1024*1024U) + +/* Timeout in ms to wait for an active mmu notifier to complete when handling + * a pagefault. */ +#define MMU_NOTIFIER_TIMEOUT 1000 + +#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) +#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) +#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) +#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) +#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) + +#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT + +static u64 mlx5_imr_ksm_entries; + +static int check_parent(struct ib_umem_odp *odp, + struct mlx5_ib_mr *parent) +{ + struct mlx5_ib_mr *mr = odp->private; + + return mr && mr->parent == parent && !odp->dying; +} + +static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) +{ + struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; + struct ib_ucontext *ctx = odp->umem->context; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + while (1) { + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (check_parent(odp, parent)) + goto end; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, + u64 start, u64 length, + struct mlx5_ib_mr *parent) +{ + struct ib_umem_odp *odp; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + if (!odp) + goto end; + + while (1) { + if (check_parent(odp, parent)) + goto end; + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (ib_umem_start(odp->umem) > start + length) + goto not_found; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags) +{ + struct ib_pd *pd = mr->ibmr.pd; + struct ib_ucontext *ctx = pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; + unsigned long va; + int i; + + if (flags & MLX5_IB_UPD_XLT_ZAP) { + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + pklm->key = cpu_to_be32(dev->null_mkey); + pklm->va = 0; + } + return; + } + + odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); + + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + va = (offset + i) * MLX5_IMR_MTT_SIZE; + if (odp && odp->umem->address == va) { + struct mlx5_ib_mr *mtt = odp->private; + + pklm->key = cpu_to_be32(mtt->ibmr.lkey); + odp = odp_next(odp); + } else { + pklm->key = cpu_to_be32(dev->null_mkey); + } + mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", + i, va, be32_to_cpu(pklm->key)); + } +} + +static void mr_leaf_free_action(struct work_struct *work) +{ + struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); + int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; + + mr->parent = NULL; + synchronize_srcu(&mr->dev->mr_srcu); + + ib_umem_release(odp->umem); + if (imr->live) + mlx5_ib_update_xlt(imr, idx, 1, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mlx5_mr_cache_free(mr->dev, mr); + + if (atomic_dec_and_test(&imr->num_leaf_free)) + wake_up(&imr->q_leaf_free); +} + +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end) +{ + struct mlx5_ib_mr *mr; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / + sizeof(struct mlx5_mtt)) - 1; + u64 idx = 0, blk_start_idx = 0; + int in_block = 0; + u64 addr; + + if (!umem || !umem->odp_data) { + pr_err("invalidation called on NULL umem or non-ODP umem\n"); + return; + } + + mr = umem->odp_data->private; + + if (!mr || !mr->ibmr.pd) + return; + + start = max_t(u64, ib_umem_start(umem), start); + end = min_t(u64, ib_umem_end(umem), end); + + /* + * Iteration one - zap the HW's MTTs. The notifiers_count ensures that + * while we are doing the invalidation, no page fault will attempt to + * overwrite the same MTTs. Concurent invalidations might race us, + * but they will write 0s as well, so no difference in the end result. + */ + + for (addr = start; addr < end; addr += BIT(umem->page_shift)) { + idx = (addr - ib_umem_start(umem)) >> umem->page_shift; + /* + * Strive to write the MTTs in chunks, but avoid overwriting + * non-existing MTTs. The huristic here can be improved to + * estimate the cost of another UMR vs. the cost of bigger + * UMR. + */ + if (umem->odp_data->dma_list[idx] & + (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); + in_block = 0; + } + } + } + if (in_block) + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx + 1, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); + /* + * We are now sure that the device will not access the + * memory. We can safely unmap it, and mark it as dirty if + * needed. + */ + + ib_umem_odp_unmap_dma_pages(umem, start, end); + + if (unlikely(!umem->npages && mr->parent && + !umem->odp_data->dying)) { + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&mr->parent->num_leaf_free); + schedule_work(&umem->odp_data->work); + } +} + +void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) +{ + struct ib_odp_caps *caps = &dev->odp_caps; + + memset(caps, 0, sizeof(*caps)); + + if (!MLX5_CAP_GEN(dev->mdev, pg)) + return; + + caps->general_caps = IB_ODP_SUPPORT; + + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + dev->odp_max_size = U64_MAX; + else + dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); + + if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + + if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && + MLX5_CAP_GEN(dev->mdev, null_mkey) && + MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; + + return; +} + +static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + int error) +{ + int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? + pfault->wqe.wq_num : pfault->token; + int ret = mlx5_core_page_fault_resume(dev->mdev, + pfault->token, + wq_num, + pfault->type, + error); + if (ret) + mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", + wq_num); +} + +static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, + struct ib_umem *umem, + bool ksm, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + int err; + + mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : + MLX5_IMR_MTT_CACHE_ENTRY); + + if (IS_ERR(mr)) + return mr; + + mr->ibmr.pd = pd; + + mr->dev = dev; + mr->access_flags = access_flags; + mr->mmkey.iova = 0; + mr->umem = umem; + + if (ksm) { + err = mlx5_ib_update_xlt(mr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + + } else { + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE | + MLX5_IB_UPD_XLT_ATOMIC); + } + + if (err) + goto fail; + + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + + mr->live = 1; + + mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", + mr->mmkey.key, dev->mdev, mr); + + return mr; + +fail: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_mr_cache_free(dev, mr); + + return ERR_PTR(err); +} + +static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, + u64 io_virt, size_t bcnt) +{ + struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); + struct ib_umem_odp *odp, *result = NULL; + u64 addr = io_virt & MLX5_IMR_MTT_MASK; + int nentries = 0, start_idx = 0, ret; + struct mlx5_ib_mr *mtt; + struct ib_umem *umem; + + mutex_lock(&mr->umem->odp_data->umem_mutex); + odp = odp_lookup(ctx, addr, 1, mr); + + mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", + io_virt, bcnt, addr, odp); + +next_mr: + if (likely(odp)) { + if (nentries) + nentries++; + } else { + umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + if (IS_ERR(umem)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return ERR_CAST(umem); + } + + mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + if (IS_ERR(mtt)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + ib_umem_release(umem); + return ERR_CAST(mtt); + } + + odp = umem->odp_data; + odp->private = mtt; + mtt->umem = umem; + mtt->mmkey.iova = addr; + mtt->parent = mr; + INIT_WORK(&odp->work, mr_leaf_free_action); + + if (!nentries) + start_idx = addr >> MLX5_IMR_MTT_SHIFT; + nentries++; + } + + /* Return first odp if region not covered by single one */ + if (likely(!result)) + result = odp; + + addr += MLX5_IMR_MTT_SIZE; + if (unlikely(addr < io_virt + bcnt)) { + odp = odp_next(odp); + if (odp && odp->umem->address != addr) + odp = NULL; + goto next_mr; + } + + if (unlikely(nentries)) { + ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + if (ret) { + mlx5_ib_err(dev, "Failed to update PAS\n"); + result = ERR_PTR(ret); + } + } + + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return result; +} + +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags) +{ + struct ib_ucontext *ctx = pd->ibpd.uobject->context; + struct mlx5_ib_mr *imr; + struct ib_umem *umem; + + umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0); + if (IS_ERR(umem)) + return ERR_CAST(umem); + + imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); + if (IS_ERR(imr)) { + ib_umem_release(umem); + return ERR_CAST(imr); + } + + imr->umem = umem; + init_waitqueue_head(&imr->q_leaf_free); + atomic_set(&imr->num_leaf_free, 0); + + return imr; +} + +static int mr_leaf_free(struct ib_umem *umem, u64 start, + u64 end, void *cookie) +{ + struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + + if (mr->parent != imr) + return 0; + + ib_umem_odp_unmap_dma_pages(umem, + ib_umem_start(umem), + ib_umem_end(umem)); + + if (umem->odp_data->dying) + return 0; + + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&imr->num_leaf_free); + schedule_work(&umem->odp_data->work); + + return 0; +} + +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) +{ + struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + + down_read(&ctx->umem_rwsem); + rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + mr_leaf_free, true, imr); + up_read(&ctx->umem_rwsem); + + wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); +} + +static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + u64 io_virt, size_t bcnt, u32 *bytes_mapped) +{ + u64 access_mask; + int npages = 0, page_shift, np; + u64 start_idx, page_mask; + struct ib_umem_odp *odp; + int current_seq; + size_t size; + int ret; + + if (!mr->umem->odp_data->page_list) { + odp = implicit_mr_get_data(mr, io_virt, bcnt); + + if (IS_ERR(odp)) + return PTR_ERR(odp); + mr = odp->private; + + } else { + odp = mr->umem->odp_data; + } + +next_mr: + size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); + + page_shift = mr->umem->page_shift; + page_mask = ~(BIT(page_shift) - 1); + start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; + access_mask = ODP_READ_ALLOWED_BIT; + + if (mr->umem->writable) + access_mask |= ODP_WRITE_ALLOWED_BIT; + + current_seq = READ_ONCE(odp->notifiers_seq); + /* + * Ensure the sequence number is valid for some time before we call + * gup. + */ + smp_rmb(); + + ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + access_mask, current_seq); + + if (ret < 0) + goto out; + + np = ret; + + mutex_lock(&odp->umem_mutex); + if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + /* + * No need to check whether the MTTs really belong to + * this MR, since ib_umem_odp_map_dma_pages already + * checks this. + */ + ret = mlx5_ib_update_xlt(mr, start_idx, np, + page_shift, MLX5_IB_UPD_XLT_ATOMIC); + } else { + ret = -EAGAIN; + } + mutex_unlock(&odp->umem_mutex); + + if (ret < 0) { + if (ret != -EAGAIN) + mlx5_ib_err(dev, "Failed to update mkey page tables\n"); + goto out; + } + + if (bytes_mapped) { + u32 new_mappings = (np << page_shift) - + (io_virt - round_down(io_virt, 1 << page_shift)); + *bytes_mapped += min_t(u32, new_mappings, size); + } + + npages += np << (page_shift - PAGE_SHIFT); + bcnt -= size; + + if (unlikely(bcnt)) { + struct ib_umem_odp *next; + + io_virt += size; + next = odp_next(odp); + if (unlikely(!next || next->umem->address != io_virt)) { + mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", + io_virt, next); + return -EAGAIN; + } + odp = next; + mr = odp->private; + goto next_mr; + } + + return npages; + +out: + if (ret == -EAGAIN) { + if (mr->parent || !odp->dying) { + unsigned long timeout = + msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); + + if (!wait_for_completion_timeout( + &odp->notifier_completion, + timeout)) { + mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n", + current_seq, odp->notifiers_seq); + } + } else { + /* The MR is being killed, kill the QP as well. */ + ret = -EFAULT; + } + } + + return ret; +} + +struct pf_frame { + struct pf_frame *next; + u32 key; + u64 io_virt; + size_t bcnt; + int depth; +}; + +/* + * Handle a single data segment in a page-fault WQE or RDMA region. + * + * Returns number of OS pages retrieved on success. The caller may continue to + * the next data segment. + * Can return the following error codes: + * -EAGAIN to designate a temporary error. The caller will abort handling the + * page fault and resolve it. + * -EFAULT when there's an error mapping the requested pages. The caller will + * abort the page fault handling. + */ +static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, + u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_committed, + u32 *bytes_mapped) +{ + int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; + struct pf_frame *head = NULL, *frame; + struct mlx5_core_mkey *mmkey; + struct mlx5_ib_mw *mw; + struct mlx5_ib_mr *mr; + struct mlx5_klm *pklm; + u32 *out = NULL; + size_t offset; + + srcu_key = srcu_read_lock(&dev->mr_srcu); + + io_virt += *bytes_committed; + bcnt -= *bytes_committed; + +next_mr: + mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key)); + if (!mmkey || mmkey->key != key) { + mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); + ret = -EFAULT; + goto srcu_unlock; + } + + switch (mmkey->type) { + case MLX5_MKEY_MR: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + if (!mr->live || !mr->ibmr.pd) { + mlx5_ib_dbg(dev, "got dead MR\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped); + if (ret < 0) + goto srcu_unlock; + + npages += ret; + ret = 0; + break; + + case MLX5_MKEY_MW: + mw = container_of(mmkey, struct mlx5_ib_mw, mmkey); + + if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) { + mlx5_ib_dbg(dev, "indirection level exceeded\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + outlen = MLX5_ST_SZ_BYTES(query_mkey_out) + + sizeof(*pklm) * (mw->ndescs - 2); + + if (outlen > cur_outlen) { + kfree(out); + out = kzalloc(outlen, GFP_KERNEL); + if (!out) { + ret = -ENOMEM; + goto srcu_unlock; + } + cur_outlen = outlen; + } + + pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out, + bsf0_klm0_pas_mtt0_1); + + ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen); + if (ret) + goto srcu_unlock; + + offset = io_virt - MLX5_GET64(query_mkey_out, out, + memory_key_mkey_entry.start_addr); + + for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) { + if (offset >= be32_to_cpu(pklm->bcount)) { + offset -= be32_to_cpu(pklm->bcount); + continue; + } + + frame = kzalloc(sizeof(*frame), GFP_KERNEL); + if (!frame) { + ret = -ENOMEM; + goto srcu_unlock; + } + + frame->key = be32_to_cpu(pklm->key); + frame->io_virt = be64_to_cpu(pklm->va) + offset; + frame->bcnt = min_t(size_t, bcnt, + be32_to_cpu(pklm->bcount) - offset); + frame->depth = depth + 1; + frame->next = head; + head = frame; + + bcnt -= frame->bcnt; + offset = 0; + } + break; + + default: + mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type); + ret = -EFAULT; + goto srcu_unlock; + } + + if (head) { + frame = head; + head = frame->next; + + key = frame->key; + io_virt = frame->io_virt; + bcnt = frame->bcnt; + depth = frame->depth; + kfree(frame); + + goto next_mr; + } + +srcu_unlock: + while (head) { + frame = head; + head = frame->next; + kfree(frame); + } + kfree(out); + + srcu_read_unlock(&dev->mr_srcu, srcu_key); + *bytes_committed = 0; + return ret ? ret : npages; +} + +/** + * Parse a series of data segments for page fault handling. + * + * @qp the QP on which the fault occurred. + * @pfault contains page fault information. + * @wqe points at the first data segment in the WQE. + * @wqe_end points after the end of the WQE. + * @bytes_mapped receives the number of bytes that the function was able to + * map. This allows the caller to decide intelligently whether + * enough memory was mapped to resolve the page fault + * successfully (e.g. enough for the next MTU, or the entire + * WQE). + * @total_wqe_bytes receives the total data size of this WQE in bytes (minus + * the committed bytes). + * + * Returns the number of pages loaded if positive, zero for an empty WQE, or a + * negative error code. + */ +static int pagefault_data_segments(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void *wqe, + void *wqe_end, u32 *bytes_mapped, + u32 *total_wqe_bytes, int receive_queue) +{ + int ret = 0, npages = 0; + u64 io_virt; + u32 key; + u32 byte_count; + size_t bcnt; + int inline_segment; + + /* Skip SRQ next-WQE segment. */ + if (receive_queue && qp->ibqp.srq) + wqe += sizeof(struct mlx5_wqe_srq_next_seg); + + if (bytes_mapped) + *bytes_mapped = 0; + if (total_wqe_bytes) + *total_wqe_bytes = 0; + + while (wqe < wqe_end) { + struct mlx5_wqe_data_seg *dseg = wqe; + + io_virt = be64_to_cpu(dseg->addr); + key = be32_to_cpu(dseg->lkey); + byte_count = be32_to_cpu(dseg->byte_count); + inline_segment = !!(byte_count & MLX5_INLINE_SEG); + bcnt = byte_count & ~MLX5_INLINE_SEG; + + if (inline_segment) { + bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; + wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, + 16); + } else { + wqe += sizeof(*dseg); + } + + /* receive WQE end of sg list. */ + if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + io_virt == 0) + break; + + if (!inline_segment && total_wqe_bytes) { + *total_wqe_bytes += bcnt - min_t(size_t, bcnt, + pfault->bytes_committed); + } + + /* A zero length data segment designates a length of 2GB. */ + if (bcnt == 0) + bcnt = 1U << 31; + + if (inline_segment || bcnt <= pfault->bytes_committed) { + pfault->bytes_committed -= + min_t(size_t, bcnt, + pfault->bytes_committed); + continue; + } + + ret = pagefault_single_data_segment(dev, key, io_virt, bcnt, + &pfault->bytes_committed, + bytes_mapped); + if (ret < 0) + break; + npages += ret; + } + + return ret < 0 ? ret : npages; +} + +static const u32 mlx5_ib_odp_opcode_cap[] = { + [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ, + [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC, + [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC, +}; + +/* + * Parse initiator WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_initiator_pfault_handler( + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_wqe_ctrl_seg *ctrl = *wqe; + u16 wqe_index = pfault->wqe.wqe_index; + u32 transport_caps; + struct mlx5_base_av *av; + unsigned ds, opcode; +#if defined(DEBUG) + u32 ctrl_wqe_index, ctrl_qpn; +#endif + u32 qpn = qp->trans_qp.base.mqp.qpn; + + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + if (ds * MLX5_WQE_DS_UNITS > wqe_length) { + mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", + ds, wqe_length); + return -EFAULT; + } + + if (ds == 0) { + mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", + wqe_index, qpn); + return -EFAULT; + } + +#if defined(DEBUG) + ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_WQE_INDEX_MASK) >> + MLX5_WQE_CTRL_WQE_INDEX_SHIFT; + if (wqe_index != ctrl_wqe_index) { + mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", + wqe_index, qpn, + ctrl_wqe_index); + return -EFAULT; + } + + ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> + MLX5_WQE_CTRL_QPN_SHIFT; + if (qpn != ctrl_qpn) { + mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", + wqe_index, qpn, + ctrl_qpn); + return -EFAULT; + } +#endif /* DEBUG */ + + *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; + *wqe += sizeof(*ctrl); + + opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_OPCODE_MASK; + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps; + break; + case IB_QPT_UD: + transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps; + break; + default: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) || + !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) { + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n", + opcode); + return -EFAULT; + } + + if (qp->ibqp.qp_type != IB_QPT_RC) { + av = *wqe; + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) + *wqe += sizeof(struct mlx5_av); + else + *wqe += sizeof(struct mlx5_base_av); + } + + switch (opcode) { + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + case MLX5_OPCODE_RDMA_READ: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_ATOMIC_CS: + case MLX5_OPCODE_ATOMIC_FA: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + *wqe += sizeof(struct mlx5_wqe_atomic_seg); + break; + } + + return 0; +} + +/* + * Parse responder WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_responder_pfault_handler( + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_wq *wq = &qp->rq; + int wqe_size = 1 << wq->wqe_shift; + + if (qp->ibqp.srq) { + mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); + return -EFAULT; + } + + if (qp->wq_sig) { + mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); + return -EFAULT; + } + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_RECV)) + goto invalid_transport_or_opcode; + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + *wqe_end = *wqe + wqe_size; + + return 0; +} + +static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev, + u32 wq_num) +{ + struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num); + + if (!mqp) { + mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num); + return NULL; + } + + return to_mibqp(mqp); +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + int ret; + void *wqe, *wqe_end; + u32 bytes_mapped, total_wqe_bytes; + char *buffer = NULL; + int resume_with_error = 1; + u16 wqe_index = pfault->wqe.wqe_index; + int requestor = pfault->type & MLX5_PFAULT_REQUESTOR; + struct mlx5_ib_qp *qp; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) { + mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); + goto resolve_page_fault; + } + + qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num); + if (!qp) + goto resolve_page_fault; + + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, + PAGE_SIZE, &qp->trans_qp.base); + if (ret < 0) { + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n", + ret, wqe_index, pfault->token); + goto resolve_page_fault; + } + + wqe = buffer; + if (requestor) + ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe, + &wqe_end, ret); + else + ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe, + &wqe_end, ret); + if (ret < 0) + goto resolve_page_fault; + + if (wqe >= wqe_end) { + mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); + goto resolve_page_fault; + } + + ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end, + &bytes_mapped, &total_wqe_bytes, + !requestor); + if (ret == -EAGAIN) { + resume_with_error = 0; + goto resolve_page_fault; + } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { + goto resolve_page_fault; + } + + resume_with_error = 0; +resolve_page_fault: + mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", + pfault->wqe.wq_num, resume_with_error, + pfault->type); + free_page((unsigned long)buffer); +} + +static int pages_in_range(u64 address, u32 length) +{ + return (ALIGN(address + length, PAGE_SIZE) - + (address & PAGE_MASK)) >> PAGE_SHIFT; +} + +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + u64 address; + u32 length; + u32 prefetch_len = pfault->bytes_committed; + int prefetch_activated = 0; + u32 rkey = pfault->rdma.r_key; + int ret; + + /* The RDMA responder handler handles the page fault in two parts. + * First it brings the necessary pages for the current packet + * (and uses the pfault context), and then (after resuming the QP) + * prefetches more pages. The second operation cannot use the pfault + * context and therefore uses the dummy_pfault context allocated on + * the stack */ + pfault->rdma.rdma_va += pfault->bytes_committed; + pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, + pfault->rdma.rdma_op_len); + pfault->bytes_committed = 0; + + address = pfault->rdma.rdma_va; + length = pfault->rdma.rdma_op_len; + + /* For some operations, the hardware cannot tell the exact message + * length, and in those cases it reports zero. Use prefetch + * logic. */ + if (length == 0) { + prefetch_activated = 1; + length = pfault->rdma.packet_size; + prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); + } + + ret = pagefault_single_data_segment(dev, rkey, address, length, + &pfault->bytes_committed, NULL); + if (ret == -EAGAIN) { + /* We're racing with an invalidation, don't prefetch */ + prefetch_activated = 0; + } else if (ret < 0 || pages_in_range(address, length) > ret) { + mlx5_ib_page_fault_resume(dev, pfault, 1); + if (ret != -ENOENT) + mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + ret, pfault->token, pfault->type); + return; + } + + mlx5_ib_page_fault_resume(dev, pfault, 0); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + pfault->token, pfault->type, + prefetch_activated); + + /* At this point, there might be a new pagefault already arriving in + * the eq, switch to the dummy pagefault for the rest of the + * processing. We're still OK with the objects being alive as the + * work-queue is being fenced. */ + + if (prefetch_activated) { + u32 bytes_committed = 0; + + ret = pagefault_single_data_segment(dev, rkey, address, + prefetch_len, + &bytes_committed, NULL); + if (ret < 0 && ret != -EAGAIN) { + mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + ret, pfault->token, address, prefetch_len); + } + } +} + +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault) +{ + struct mlx5_ib_dev *dev = context; + u8 event_subtype = pfault->event_subtype; + + switch (event_subtype) { + case MLX5_PFAULT_SUBTYPE_WQE: + mlx5_ib_mr_wqe_pfault_handler(dev, pfault); + break; + case MLX5_PFAULT_SUBTYPE_RDMA: + mlx5_ib_mr_rdma_pfault_handler(dev, pfault); + break; + default: + mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(dev, pfault, 1); + } +} + +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) +{ + if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return; + + switch (ent->order - 2) { + case MLX5_IMR_MTT_CACHE_ENTRY: + ent->page = PAGE_SHIFT; + ent->xlt = MLX5_IMR_MTT_ENTRIES * + sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + ent->limit = 0; + break; + + case MLX5_IMR_KSM_CACHE_ENTRY: + ent->page = MLX5_KSM_PAGE_SHIFT; + ent->xlt = mlx5_imr_ksm_entries * + sizeof(struct mlx5_klm) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; + ent->limit = 0; + break; + } +} + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) +{ + int ret; + + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { + ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); + if (ret) { + mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); + return ret; + } + } + + return 0; +} + +int mlx5_ib_odp_init(void) +{ + mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - + MLX5_IMR_MTT_BITS); + + return 0; +} + diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c new file mode 100644 index 000000000..361b1b859 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -0,0 +1,5891 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_user_verbs.h> +#include <linux/mlx5/fs.h> +#include "mlx5_ib.h" +#include "ib_rep.h" + +/* not supported currently */ +static int wq_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; + +enum raw_qp_set_mask_map { + MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID = 1UL << 0, + MLX5_RAW_QP_RATE_LIMIT = 1UL << 1, +}; + +struct mlx5_modify_raw_qp_param { + u16 operation; + + u32 set_mask; /* raw_qp_set_mask_map */ + + struct mlx5_rate_limit rl; + + u8 rq_q_ctr_id; +}; + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + +/** + * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. + * + * @qp: QP to copy from. + * @send: copy from the send queue when non-zero, use the receive queue + * otherwise. + * @wqe_index: index to start copying from. For send work queues, the + * wqe_index is in units of MLX5_SEND_WQE_BB. + * For receive work queue, it is the number of work queue + * element in the queue. + * @buffer: destination buffer. + * @length: maximum number of bytes to copy. + * + * Copies at least a single WQE, but may copy more data. + * + * Return: the number of bytes copied, or an error code. + */ +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length, + struct mlx5_ib_qp_base *base) +{ + struct ib_device *ibdev = qp->ibqp.device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; + size_t offset; + size_t wq_end; + struct ib_umem *umem = base->ubuffer.umem; + u32 first_copy_length; + int wqe_length; + int ret; + + if (wq->wqe_cnt == 0) { + mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n", + qp->ibqp.qp_type); + return -EINVAL; + } + + offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); + wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); + + if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (offset > umem->length || + (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) + return -EINVAL; + + first_copy_length = min_t(u32, offset + length, wq_end) - offset; + ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); + if (ret) + return ret; + + if (send) { + struct mlx5_wqe_ctrl_seg *ctrl = buffer; + int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + + wqe_length = ds * MLX5_WQE_DS_UNITS; + } else { + wqe_length = 1 << wq->wqe_shift; + } + + if (wqe_length <= first_copy_length) + return first_copy_length; + + ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, + wqe_length - first_copy_length); + if (ret) + return ret; + + return wqe_length; +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; + } + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + int wqe_size; + int wq_size; + + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + cap->max_recv_wr = 0; + cap->max_recv_sge = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + if (ucmd->rq_wqe_shift > BITS_PER_BYTE * sizeof(ucmd->rq_wqe_shift)) + return -EINVAL; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + if ((1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) < qp->wq_sig) + return -EINVAL; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + MLX5_CAP_GEN(dev->mdev, + max_wqe_sz_rq)); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(struct ib_qp_init_attr *attr) +{ + int size = 0; + + switch (attr->qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + max(sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / + MLX5_IB_UMR_OCTOWORD); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + max(sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); + break; + + case IB_QPT_UD: + if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + size += sizeof(struct mlx5_wqe_eth_pad) + + sizeof(struct mlx5_wqe_eth_seg); + /* fall through */ + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size) +{ + int max_sge; + + if (attr->qp_type == IB_QPT_RC) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else if (attr->qp_type == IB_QPT_XRC_INI) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_xrc_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else + max_sge = (wqe_size - sq_overhead(attr)) / + sizeof(struct mlx5_wqe_data_seg); + + return min_t(int, max_sge, wqe_size - sq_overhead(attr) / + sizeof(struct mlx5_wqe_data_seg)); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + int wqe_size; + int wq_size; + + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_dbg(dev, "send queue size (%d * %d / %d -> %d) exceeds limits(%d)\n", + attr->cap.max_send_wr, wqe_size, MLX5_SEND_WQE_BB, + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = get_send_sge(attr, wqe_size); + if (qp->sq.max_gs < attr->cap.max_send_sge) + return -ENOMEM; + + attr->cap.max_send_sge = qp->sq.max_gs; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd, + struct mlx5_ib_qp_base *base, + struct ib_qp_init_attr *attr) +{ + int desc_sz = 1 << qp->sq.wqe_shift; + + if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -EINVAL; + } + + if (attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; + } else { + base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + } + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +enum { + /* this is the first blue flame register in the array of bfregs assigned + * to a processes. Since we do not use it for blue flame but rather + * regular 64 bit doorbells, we do not need a lock for maintaiing + * "odd/even" order + */ + NUM_NON_BLUE_FLAME_BFREGS = 1, +}; + +static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) +{ + return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; +} + +static int num_med_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int n; + + n = max_bfregs(dev, bfregi) - bfregi->num_low_latency_bfregs - + NUM_NON_BLUE_FLAME_BFREGS; + + return n >= 0 ? n : 0; +} + +static int first_med_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + return num_med_bfreg(dev, bfregi) ? 1 : -ENOMEM; +} + +static int first_hi_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int med; + + med = num_med_bfreg(dev, bfregi); + return ++med; +} + +static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int i; + + for (i = first_hi_bfreg(dev, bfregi); i < max_bfregs(dev, bfregi); i++) { + if (!bfregi->count[i]) { + bfregi->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int minidx = first_med_bfreg(dev, bfregi); + int i; + + if (minidx < 0) + return minidx; + + for (i = minidx; i < first_hi_bfreg(dev, bfregi); i++) { + if (bfregi->count[i] < bfregi->count[minidx]) + minidx = i; + if (!bfregi->count[minidx]) + break; + } + + bfregi->count[minidx]++; + return minidx; +} + +static int alloc_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int bfregn = -ENOMEM; + + mutex_lock(&bfregi->lock); + if (bfregi->ver >= 2) { + bfregn = alloc_high_class_bfreg(dev, bfregi); + if (bfregn < 0) + bfregn = alloc_med_class_bfreg(dev, bfregi); + } + + if (bfregn < 0) { + BUILD_BUG_ON(NUM_NON_BLUE_FLAME_BFREGS != 1); + bfregn = 0; + bfregi->count[bfregn]++; + } + mutex_unlock(&bfregi->lock); + + return bfregn; +} + +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) +{ + mutex_lock(&bfregi->lock); + bfregi->count[bfregn]--; + mutex_unlock(&bfregi->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_DCI: return MLX5_QP_ST_DCI; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_PACKET: + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); + +int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, u32 bfregn, + bool dyn_bfreg) +{ + unsigned int bfregs_per_sys_page; + u32 index_of_sys_page; + u32 offset; + + bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * + MLX5_NON_FP_BFREGS_PER_UAR; + index_of_sys_page = bfregn / bfregs_per_sys_page; + + if (dyn_bfreg) { + index_of_sys_page += bfregi->num_static_sys_pages; + + if (index_of_sys_page >= bfregi->num_sys_pages) + return -EINVAL; + + if (bfregn > bfregi->num_dyn_bfregs || + bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_dbg(dev, "Invalid dynamic uar index\n"); + return -EINVAL; + } + } + + offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; + return bfregi->sys_pages[index_of_sys_page] + offset; +} + +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, + struct ib_pd *pd, + unsigned long addr, size_t size, + struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + u32 *offset) +{ + int err; + + *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0); + if (IS_ERR(*umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + return PTR_ERR(*umem); + } + + mlx5_ib_cont_pages(*umem, addr, 0, npages, page_shift, ncont, NULL); + + err = mlx5_ib_get_buf_offset(addr, *page_shift, offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n", + addr, size, *npages, *page_shift, *ncont, *offset); + + return 0; + +err_umem: + ib_umem_release(*umem); + *umem = NULL; + + return err; +} + +static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_ucontext *context; + + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_DELAY_DROP) + atomic_dec(&dev->delay_drop.rqs_cnt); + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &rwq->db); + if (rwq->umem) + ib_umem_release(rwq->umem); +} + +static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq, + struct mlx5_ib_create_wq *ucmd) +{ + struct mlx5_ib_ucontext *context; + int page_shift = 0; + int npages; + u32 offset = 0; + int ncont = 0; + int err; + + if (!ucmd->buf_addr) + return -EINVAL; + + context = to_mucontext(pd->uobject->context); + rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, + rwq->buf_size, 0, 0); + if (IS_ERR(rwq->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(rwq->umem); + return err; + } + + mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, 0, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, + &rwq->rq_page_offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + rwq->rq_num_pas = ncont; + rwq->page_shift = page_shift; + rwq->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE); + + mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, rwq->buf_size, + npages, page_shift, ncont, offset); + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_umem; + } + + rwq->create_type = MLX5_WQ_USER; + return 0; + +err_umem: + ib_umem_release(rwq->umem); + return err; +} + +static int adjust_bfregn(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, int bfregn) +{ + return bfregn / MLX5_NON_FP_BFREGS_PER_UAR * MLX5_BFREGS_PER_UAR + + bfregn % MLX5_NON_FP_BFREGS_PER_UAR; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, + u32 **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base) +{ + struct mlx5_ib_ucontext *context; + struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; + int page_shift = 0; + int uar_index = 0; + int npages; + u32 offset = 0; + int bfregn; + int ncont = 0; + __be64 *pas; + void *qpc; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return err; + } + + context = to_mucontext(pd->uobject->context); + if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) { + uar_index = bfregn_to_uar_index(dev, &context->bfregi, + ucmd.bfreg_index, true); + if (uar_index < 0) + return uar_index; + + bfregn = MLX5_IB_INVALID_BFREG; + } else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) { + /* + * TBD: should come from the verbs when we have the API + */ + /* In CROSS_CHANNEL CQ and QP must use the same UAR */ + bfregn = MLX5_CROSS_CHANNEL_BFREG; + } + else { + bfregn = alloc_bfreg(dev, &context->bfregi); + if (bfregn < 0) + return bfregn; + } + + mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn, + false); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, &ucmd, base, attr); + if (err) + goto err_bfreg; + + if (ucmd.buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd.buf_addr; + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, + ubuffer->buf_size, + &ubuffer->umem, &npages, &page_shift, + &ncont, &offset); + if (err) + goto err_bfreg; + } else { + ubuffer->umem = NULL; + } + + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * ncont; + *in = kvzalloc(*inlen, GFP_KERNEL); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); + if (ubuffer->umem) + mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0); + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + + MLX5_SET(qpc, qpc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, page_offset, offset); + + MLX5_SET(qpc, qpc, uar_page, uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); + else + resp->bfreg_index = MLX5_IB_INVALID_BFREG; + qp->bfregn = bfregn; + + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; + } + + err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp))); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + kvfree(*in); + +err_umem: + if (ubuffer->umem) + ib_umem_release(ubuffer->umem); + +err_bfreg: + if (bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); + return err; +} + +static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (base->ubuffer.umem) + ib_umem_release(base->ubuffer.umem); + + /* + * Free only the BFREGs which are handled by the kernel. + * BFREGs of UARs allocated dynamically are handled by user. + */ + if (qp->bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + u32 **in, int *inlen, + struct mlx5_ib_qp_base *base) +{ + int uar_index; + void *qpc; + int err; + + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_IPOIB_UD_LSO | + IB_QP_CREATE_NETIF_QP | + mlx5_ib_create_qp_sqpn_qp1())) + return -EINVAL; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + qp->bf.bfreg = &dev->fp_bfreg; + else + qp->bf.bfreg = &dev->bfreg; + + /* We need to divide by two since each register is comprised of + * two buffers of identical size, namely odd and even + */ + qp->bf.buf_size = (1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size)) / 2; + uar_index = qp->bf.bfreg->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages; + *in = kvzalloc(*inlen, GFP_KERNEL); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + MLX5_SET(qpc, qpc, uar_page, uar_index); + MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + /* Set "fast registration enabled" for all kernel QPs */ + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); + + if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) { + MLX5_SET(qpc, qpc, deth_sqpn, 1); + qp->flags |= MLX5_IB_QP_SQPN_QP1; + } + + mlx5_fill_page_array(&qp->buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas)); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_free; + } + + qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt, + sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); + mlx5_db_free(dev->mdev, &qp->db); + +err_free: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &qp->buf); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); + mlx5_db_free(dev->mdev, &qp->db); + mlx5_buf_free(dev->mdev, &qp->buf); +} + +static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == MLX5_IB_QPT_DCI) || + (attr->qp_type == IB_QPT_XRC_INI)) + return MLX5_SRQ_RQ; + else if (!qp->has_rq) + return MLX5_ZERO_LEN_RQ; + else + return MLX5_NON_ZERO_RQ; +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_sq *sq, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + MLX5_SET(tisc, tisc, transport_domain, tdn); + if (qp->flags & MLX5_IB_QP_UNDERLAY) + MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); + + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); +} + +static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_tis(dev->mdev, sq->tisn); +} + +static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + if (sq->flow_rule) + mlx5_del_flow_rules(sq->flow_rule); +} + +static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, void *qpin, + struct ib_pd *pd) +{ + struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; + int err; + int page_shift = 0; + int npages; + int ncont = 0; + u32 offset = 0; + + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, + &sq->ubuffer.umem, &npages, &page_shift, + &ncont, &offset); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_umem; + } + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) + MLX5_SET(sqc, sqc, allow_multi_pkt_send_wqe, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, swp)) + MLX5_SET(sqc, sqc, allow_swp, 1); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, page_offset, offset); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); + + err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + + kvfree(in); + + if (err) + goto err_umem; + + err = create_flow_rule_vport_sq(dev, sq); + if (err) + goto err_flow; + + return 0; + +err_flow: + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + +err_umem: + ib_umem_release(sq->ubuffer.umem); + sq->ubuffer.umem = NULL; + + return err; +} + +static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + destroy_flow_rule_vport_sq(dev, sq); + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + ib_umem_release(sq->ubuffer.umem); +} + +static size_t get_rq_pas_size(void *qpc) +{ + u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12; + u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride); + u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size); + u32 page_offset = MLX5_GET(qpc, qpc, page_offset); + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, void *qpin, + size_t qpinlen) +{ + struct mlx5_ib_qp *mqp = rq->base.container_mibqp; + __be64 *pas; + __be64 *qp_pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + size_t rq_pas_size = get_rq_pas_size(qpc); + size_t inlen; + int err; + + if (qpinlen < rq_pas_size + MLX5_BYTE_OFF(create_qp_in, pas)) + return -EINVAL; + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING)) + MLX5_SET(rqc, rqc, vsd, 1); + MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS) + MLX5_SET(rqc, rqc, scatter_fcs, 1); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + if (rq->flags & MLX5_IB_RQ_PCI_WRITE_END_PADDING) + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset)); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); + memcpy(pas, qp_pas, rq_pas_size); + + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); +} + +static bool tunnel_offload_supported(struct mlx5_core_dev *dev) +{ + return (MLX5_CAP_ETH(dev, tunnel_stateless_vxlan) || + MLX5_CAP_ETH(dev, tunnel_stateless_gre) || + MLX5_CAP_ETH(dev, tunnel_stateless_geneve_rx)); +} + +static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, u32 tdn, + bool tunnel_offload_en) +{ + u32 *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + if (tunnel_offload_en) + MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + + if (dev->rep) + MLX5_SET(tirc, tirc, self_lb_block, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_tir(dev->mdev, rq->tirn); +} + +static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u32 *in, size_t inlen, + struct ib_pd *pd) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int err; + u32 tdn = mucontext->tdn; + + if (qp->sq.wqe_cnt) { + err = create_raw_packet_qp_tis(dev, qp, sq, tdn); + if (err) + return err; + + err = create_raw_packet_qp_sq(dev, sq, in, pd); + if (err) + goto err_destroy_tis; + + sq->base.container_mibqp = qp; + sq->base.mqp.event = mlx5_ib_qp_event; + } + + if (qp->rq.wqe_cnt) { + rq->base.container_mibqp = qp; + + if (qp->flags & MLX5_IB_QP_CVLAN_STRIPPING) + rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING; + if (qp->flags & MLX5_IB_QP_PCI_WRITE_END_PADDING) + rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING; + err = create_raw_packet_qp_rq(dev, rq, in, inlen); + if (err) + goto err_destroy_sq; + + + err = create_raw_packet_qp_tir(dev, rq, tdn, + qp->tunnel_offload_en); + if (err) + goto err_destroy_rq; + } + + qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn : + rq->base.mqp.qpn; + + return 0; + +err_destroy_rq: + destroy_raw_packet_qp_rq(dev, rq); +err_destroy_sq: + if (!qp->sq.wqe_cnt) + return err; + destroy_raw_packet_qp_sq(dev, sq); +err_destroy_tis: + destroy_raw_packet_qp_tis(dev, sq); + + return err; +} + +static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + if (qp->rq.wqe_cnt) { + destroy_raw_packet_qp_tir(dev, rq); + destroy_raw_packet_qp_rq(dev, rq); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_packet_qp_sq(dev, sq); + destroy_raw_packet_qp_tis(dev, sq); + } +} + +static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, + struct mlx5_ib_raw_packet_qp *raw_packet_qp) +{ + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + sq->sq = &qp->sq; + rq->rq = &qp->rq; + sq->doorbell = &qp->db; + rq->doorbell = &qp->db; +} + +static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); +} + +static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + struct mlx5_ib_create_qp_resp resp = {}; + int inlen; + int err; + u32 *in; + void *tirc; + void *hfso; + u32 selected_fields = 0; + u32 outer_l4; + size_t min_resp_len; + u32 tdn = mucontext->tdn; + struct mlx5_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + return -EOPNOTSUPP; + + if (init_attr->create_flags || init_attr->send_cq) + return -EINVAL; + + min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); + if (udata->outlen < min_resp_len) + return -EINVAL; + + required_cmd_sz = offsetof(typeof(ucmd), flags) + sizeof(ucmd.flags); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } + + if (ucmd.flags & ~MLX5_QP_FLAG_TUNNEL_OFFLOADS) { + mlx5_ib_dbg(dev, "invalid flags\n"); + return -EOPNOTSUPP; + } + + if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS && + !tunnel_offload_supported(dev->mdev)) { + mlx5_ib_dbg(dev, "tunnel offloads isn't supported\n"); + return -EOPNOTSUPP; + } + + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER && + !(ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) { + mlx5_ib_dbg(dev, "Tunnel offloads must be set for inner RSS\n"); + return -EOPNOTSUPP; + } + + err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EINVAL; + } + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, + init_attr->rwq_ind_tbl->ind_tbl_num); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + + if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) + MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER) + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); + else + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + + switch (ucmd.rx_hash_function) { + case MLX5_RX_HASH_FUNC_TOEPLITZ: + { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); + + if (len != ucmd.rx_key_len) { + err = -EINVAL; + goto err; + } + + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ); + memcpy(rss_key, ucmd.rx_hash_key, len); + break; + } + default: + err = -EOPNOTSUPP; + goto err; + } + + if (!ucmd.rx_hash_fields_mask) { + /* special case when this TIR serves as steering entry without hashing */ + if (!init_attr->rwq_ind_tbl->log_ind_tbl_size) + goto create_tir; + err = -EINVAL; + goto err; + } + + if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { + err = -EINVAL; + goto err; + } + + /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + + outer_l4 = ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) << 0 | + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) << 1 | + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2; + + /* Check that only one l4 protocol is set */ + if (outer_l4 & (outer_l4 - 1)) { + err = -EINVAL; + goto err; + } + + /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; + + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) + selected_fields |= MLX5_HASH_FIELD_SEL_IPSEC_SPI; + + MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); + +create_tir: + if (dev->rep) + MLX5_SET(tirc, tirc, self_lb_block, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn); + + if (err) + goto err; + + kvfree(in); + /* qpn is reserved for that QP */ + qp->trans_qp.base.mqp.qpn = 0; + qp->flags |= MLX5_IB_QP_RSS; + return 0; + +err: + kvfree(in); + return err; +} + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_create_qp_resp resp = {}; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_qp_base *base; + int mlx5_st; + void *qpc; + u32 *in; + int err; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + mlx5_st = to_mlx5_st(init_attr->qp_type); + if (mlx5_st < 0) + return -EINVAL; + + if (init_attr->rwq_ind_tbl) { + if (!udata) + return -ENOSYS; + + err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); + return err; + } + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if (init_attr->create_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { + if (!MLX5_CAP_GEN(mdev, cd)) { + mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); + return -EINVAL; + } + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX5_IB_QP_MANAGED_SEND; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX5_IB_QP_MANAGED_RECV; + } + + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) + if (!MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); + return -EOPNOTSUPP; + } + if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || + !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; + } + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (init_attr->create_flags & IB_QP_CREATE_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap)) || + (init_attr->qp_type != IB_QPT_RAW_PACKET)) + return -EOPNOTSUPP; + qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING; + } + + if (pd && pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + err = get_qp_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET || + !tunnel_offload_supported(mdev)) { + mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n"); + return -EOPNOTSUPP; + } + qp->tunnel_offload_en = true; + } + + if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (init_attr->qp_type != IB_QPT_UD || + (MLX5_CAP_GEN(dev->mdev, port_type) != + MLX5_CAP_PORT_TYPE_IB) || + !mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) { + mlx5_ib_dbg(dev, "Source QP option isn't supported\n"); + return -EOPNOTSUPP; + } + + qp->flags |= MLX5_IB_QP_UNDERLAY; + qp->underlay_qpn = init_attr->source_qpn; + } + } else { + qp->wq_sig = !!wq_signature; + } + + base = (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? &ucmd : NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + __u32 max_wqes = + 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > max_wqes) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, max_wqes); + return -EINVAL; + } + if (init_attr->create_flags & + mlx5_ib_create_qp_sqpn_qp1()) { + mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); + return -EINVAL; + } + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, + &resp, &inlen, base); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } else { + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, + base); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } + + if (err) + return err; + } else { + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, mlx5_st); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn); + else + MLX5_SET(qpc, qpc, latency_sensitive, 1); + + + if (qp->wq_sig) + MLX5_SET(qpc, qpc, wq_signature, 1); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + MLX5_SET(qpc, qpc, block_lb_mc, 1); + + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + MLX5_SET(qpc, qpc, cd_master, 1); + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + MLX5_SET(qpc, qpc, cd_slave_send, 1); + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + MLX5_SET(qpc, qpc, cd_slave_receive, 1); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + else + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { + if (scqe_sz == 128) + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); + else + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); + } + } + + if (qp->rq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); + } + + MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr)); + + if (qp->sq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); + } else { + MLX5_SET(qpc, qpc, no_sq, 1); + if (init_attr->srq && + init_attr->srq->srq_type == IB_SRQT_TM) + MLX5_SET(qpc, qpc, offload_type, + MLX5_QPC_OFFLOAD_TYPE_RNDV); + } + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(init_attr->srq)->msrq.srqn); + } else { + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s1)->msrq.srqn); + } + } + + if (init_attr->send_cq) + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(init_attr->recv_cq)->mcq.cqn); + + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + qp->flags |= MLX5_IB_QP_LSO; + } + + if (init_attr->create_flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) { + if (!MLX5_CAP_GEN(dev->mdev, end_pad)) { + mlx5_ib_dbg(dev, "scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto err; + } else if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + MLX5_SET(qpc, qpc, end_padding_mode, + MLX5_WQ_END_PAD_MODE_ALIGN); + } else { + qp->flags |= MLX5_IB_QP_PCI_WRITE_END_PADDING; + } + } + + if (inlen < 0) { + err = -EINVAL; + goto err; + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; + raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); + err = create_raw_packet_qp(dev, qp, in, inlen, pd); + } else { + err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); + } + + if (err) { + mlx5_ib_dbg(dev, "create qp failed\n"); + goto err_create; + } + + kvfree(in); + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + + get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(dev, pd, qp, base); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + +err: + kvfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } + } else if (recv_cq) { + spin_lock(&recv_cq->lock); + __acquire(&send_cq->lock); + } else { + __acquire(&send_cq->lock); + __acquire(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } + } else { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } + } else if (recv_cq) { + __release(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } else { + __release(&recv_cq->lock); + __release(&send_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL; + break; + + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 lag_tx_affinity); + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_ib_qp_base *base; + unsigned long flags; + int err; + + if (qp->ibqp.rwq_ind_tbl) { + destroy_rss_raw_qp_tir(dev, qp); + return; + } + + base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + + if (qp->state != IB_QPS_RESET) { + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && + !(qp->flags & MLX5_IB_QP_UNDERLAY)) { + err = mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_2RST_QP, 0, + NULL, &base->mqp); + } else { + struct mlx5_modify_raw_qp_param raw_qp_param = { + .operation = MLX5_CMD_OP_2RST_QP + }; + + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0); + } + if (err) + mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n", + base->mqp.qpn); + } + + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + if (send_cq) + list_del(&qp->cq_send_list); + + if (recv_cq) + list_del(&qp->cq_recv_list); + + if (qp->create_type == MLX5_QP_KERNEL) { + __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, + NULL); + } + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + destroy_raw_packet_qp(dev, qp); + } else { + err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + base->mqp.qpn); + } + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_DRIVER: + return "IB_QPT_DRIVER"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_ib_qp *qp; + int err = 0; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + void *dctc; + + if (!attr->srq || !attr->recv_cq) + return ERR_PTR(-EINVAL); + + err = get_qp_user_index(to_mucontext(pd->uobject->context), + ucmd, sizeof(*ucmd), &uidx); + if (err) + return ERR_PTR(err); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); + if (!qp->dct.in) { + err = -ENOMEM; + goto err_free; + } + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + qp->qp_sub_type = MLX5_IB_QPT_DCT; + MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); + MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn); + MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); + MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); + MLX5_SET(dctc, dctc, user_index, uidx); + + qp->state = IB_QPS_RESET; + + return &qp->ibqp; +err_free: + kfree(qp); + return ERR_PTR(err); +} + +static int set_mlx_qp_type(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, + struct ib_udata *udata) +{ + enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI }; + int err; + + if (!udata) + return -EINVAL; + + if (udata->inlen < sizeof(*ucmd)) { + mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n"); + return -EINVAL; + } + err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd)); + if (err) + return err; + + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCI) { + init_attr->qp_type = MLX5_IB_QPT_DCI; + } else { + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCT) { + init_attr->qp_type = MLX5_IB_QPT_DCT; + } else { + mlx5_ib_dbg(dev, "Invalid QP flags\n"); + return -EINVAL; + } + } + + if (!MLX5_CAP_GEN(dev->mdev, dct)) { + mlx5_ib_dbg(dev, "DC transport is not supported\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *verbs_init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + struct ib_qp_init_attr mlx_init_attr; + struct ib_qp_init_attr *init_attr = verbs_init_attr; + + if (pd) { + dev = to_mdev(pd->device); + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (!pd->uobject) { + mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); + return ERR_PTR(-EINVAL); + } else if (!to_mucontext(pd->uobject->context)->cqe_version) { + mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); + return ERR_PTR(-EINVAL); + } + } + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + pr_warn("%s: no PD for transport %s\n", __func__, + ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + + if (init_attr->qp_type == IB_QPT_DRIVER) { + struct mlx5_ib_create_qp ucmd; + + init_attr = &mlx_init_attr; + memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr)); + err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); + if (err) + return ERR_PTR(err); + + if (init_attr->qp_type == MLX5_IB_QPT_DCI) { + if (init_attr->cap.max_recv_wr || + init_attr->cap.max_recv_sge) { + mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n"); + return ERR_PTR(-EINVAL); + } + } else { + return mlx5_ib_create_dct(pd, init_attr, &ucmd); + } + } + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!MLX5_CAP_GEN(dev->mdev, xrc)) { + mlx5_ib_dbg(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RAW_PACKET: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case MLX5_IB_QPT_REG_UMR: + case MLX5_IB_QPT_DCI: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_dbg(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; + + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1, + init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1); + + qp->trans_qp.xrcdn = xrcdn; + + break; + + case IB_QPT_GSI: + return mlx5_ib_gsi_create_qp(pd, init_attr); + + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_MAX: + default: + mlx5_ib_dbg(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + if (verbs_init_attr->qp_type == IB_QPT_DRIVER) + qp->qp_sub_type = init_attr->qp_type; + + return &qp->ibqp; +} + +static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); + + if (mqp->state == IB_QPS_RTR) { + int err; + + err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct); + if (err) { + mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); + return err; + } + } + + kfree(mqp->dct.in); + kfree(mqp); + return 0; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + if (unlikely(qp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_destroy_qp(qp); + + if (mqp->qp_sub_type == MLX5_IB_QPT_DCT) + return mlx5_ib_destroy_dct(mqp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->trans_qp.resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->trans_qp.atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + if (rate == IB_RATE_PORT_CURRENT) + return 0; + + if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) + return -EINVAL; + + while (rate != IB_RATE_PORT_CURRENT && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + MLX5_CAP_GEN(dev->mdev, stat_rate_support))) + --rate; + + return rate ? rate + MLX5_STAT_RATE_OFFSET : rate; +} + +static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 sl) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 tx_affinity) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct rdma_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr, + bool alt) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(ah); + int err; + enum ib_gid_type gid_type; + u8 ah_flags = rdma_ah_get_ah_flags(ah); + u8 sl = rdma_ah_get_sl(ah); + + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index : + attr->pkey_index); + + if (ah_flags & IB_AH_GRH) { + if (grh->sgid_index >= + dev->mdev->port_caps[port - 1].gid_table_len) { + pr_err("sgid_index (%u) too large. max is %d\n", + grh->sgid_index, + dev->mdev->port_caps[port - 1].gid_table_len); + return -EINVAL; + } + } + + if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (!(ah_flags & IB_AH_GRH)) + return -EINVAL; + + memcpy(path->rmac, ah->roce.dmac, sizeof(ah->roce.dmac)); + if (qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC_INI || + qp->ibqp.qp_type == IB_QPT_XRC_TGT) + path->udp_sport = + mlx5_get_roce_udp_sport(dev, ah->grh.sgid_attr); + path->dci_cfi_prio_sl = (sl & 0x7) << 4; + gid_type = ah->grh.sgid_attr->gid_type; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + path->ecn_dscp = (grh->traffic_class >> 2) & 0x3f; + } else { + path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->fl_free_ar |= + (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; + path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah)); + path->grh_mlid = rdma_ah_get_path_bits(ah) & 0x7f; + if (ah_flags & IB_AH_GRH) + path->grh_mlid |= 1 << 7; + path->dci_cfi_prio_sl = sl & 0xf; + } + + if (ah_flags & IB_AH_GRH) { + path->mgid_index = grh->sgid_index; + path->hop_limit = grh->hop_limit; + path->tclass_flowlabel = + cpu_to_be32((grh->traffic_class << 20) | + (grh->flow_label)); + memcpy(path->rgid, grh->dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = (alt ? attr->alt_timeout : attr->timeout) << 3; + + if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) + return modify_raw_packet_eth_prio(dev->mdev, + &qp->raw_packet_qp.sq, + sl & 0xf); + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rq_state, rq->state); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); + } else + pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", + dev->ib_dev.name); + } + + err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + rq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, + int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param) +{ + struct mlx5_ib_qp *ibqp = sq->base.container_mibqp; + struct mlx5_rate_limit old_rl = ibqp->rl; + struct mlx5_rate_limit new_rl = old_rl; + bool new_rate_added = false; + u16 rl_index = 0; + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sq_state, sq->state); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + if (raw_qp_param->set_mask & MLX5_RAW_QP_RATE_LIMIT) { + if (new_state != MLX5_SQC_STATE_RDY) + pr_warn("%s: Rate limit can only be changed when SQ is moving to RDY\n", + __func__); + else + new_rl = raw_qp_param->rl; + } + + if (!mlx5_rl_are_equal(&old_rl, &new_rl)) { + if (new_rl.rate) { + err = mlx5_rl_add_rate(dev, &rl_index, &new_rl); + if (err) { + pr_err("Failed configuring rate limit(err %d): \ + rate %u, max_burst_sz %u, typical_pkt_sz %u\n", + err, new_rl.rate, new_rl.max_burst_sz, + new_rl.typical_pkt_sz); + + goto out; + } + new_rate_added = true; + } + + MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); + /* index 0 means no limit */ + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); + } + + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); + if (err) { + /* Remove new rate from table if failed */ + if (new_rate_added) + mlx5_rl_remove_rate(dev, &new_rl); + goto out; + } + + /* Only remove the old rate after new rate was set */ + if ((old_rl.rate && !mlx5_rl_are_equal(&old_rl, &new_rl)) || + (new_state != MLX5_SQC_STATE_RDY)) { + mlx5_rl_remove_rate(dev, &old_rl); + if (new_state != MLX5_SQC_STATE_RDY) + memset(&new_rl, 0, sizeof(new_rl)); + } + + ibqp->rl = new_rl; + sq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 tx_affinity) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + int modify_rq = !!qp->rq.wqe_cnt; + int modify_sq = !!qp->sq.wqe_cnt; + int rq_state; + int sq_state; + int err; + + switch (raw_qp_param->operation) { + case MLX5_CMD_OP_RST2INIT_QP: + rq_state = MLX5_RQC_STATE_RDY; + sq_state = MLX5_SQC_STATE_RDY; + break; + case MLX5_CMD_OP_2ERR_QP: + rq_state = MLX5_RQC_STATE_ERR; + sq_state = MLX5_SQC_STATE_ERR; + break; + case MLX5_CMD_OP_2RST_QP: + rq_state = MLX5_RQC_STATE_RST; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + if (raw_qp_param->set_mask == + MLX5_RAW_QP_RATE_LIMIT) { + modify_rq = 0; + sq_state = sq->state; + } else { + return raw_qp_param->set_mask ? -EINVAL : 0; + } + break; + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + if (raw_qp_param->set_mask) + return -EINVAL; + else + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if (modify_rq) { + err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param); + if (err) + return err; + } + + if (modify_sq) { + if (tx_affinity) { + err = modify_raw_packet_tx_affinity(dev->mdev, sq, + tx_affinity); + if (err) + return err; + } + + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, raw_qp_param); + } + + return 0; +} + +static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, + struct mlx5_ib_pd *pd, + struct mlx5_ib_qp_base *qp_base, + u8 port_num) +{ + struct mlx5_ib_ucontext *ucontext = NULL; + unsigned int tx_port_affinity; + + if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context) + ucontext = to_mucontext(pd->ibpd.uobject->context); + + if (ucontext) { + tx_port_affinity = (unsigned int)atomic_add_return( + 1, &ucontext->tx_port_affinity) % + MLX5_MAX_PORTS + + 1; + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n", + tx_port_affinity, qp_base->mqp.qpn, ucontext); + } else { + tx_port_affinity = + (unsigned int)atomic_add_return( + 1, &dev->roce[port_num].tx_port_affinity) % + MLX5_MAX_PORTS + + 1; + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n", + tx_port_affinity, qp_base->mqp.qpn); + } + + return tx_port_affinity; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state, + const struct mlx5_ib_modify_qp *ucmd) +{ + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_ib_pd *pd; + struct mlx5_ib_port *mibport = NULL; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int mlx5_st; + int err; + u16 op; + u8 tx_affinity = 0; + + mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? + qp->qp_sub_type : ibqp->qp_type); + if (mlx5_st < 0) + return -EINVAL; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + pd = get_pd(qp); + context->flags = cpu_to_be32(mlx5_st << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { + if ((ibqp->qp_type == IB_QPT_RC) || + (ibqp->qp_type == IB_QPT_UD && + !(qp->flags & MLX5_IB_QP_SQPN_QP1)) || + (ibqp->qp_type == IB_QPT_UC) || + (ibqp->qp_type == IB_QPT_RAW_PACKET) || + (ibqp->qp_type == IB_QPT_XRC_INI) || + (ibqp->qp_type == IB_QPT_XRC_TGT)) { + if (mlx5_lag_is_active(dev->mdev)) { + u8 p = mlx5_core_native_port_num(dev->mdev); + tx_affinity = get_tx_affinity(dev, pd, base, p); + context->flags |= cpu_to_be32(tx_affinity << 24); + } + } + } + + if (is_sqp(ibqp->qp_type)) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if ((ibqp->qp_type == IB_QPT_UD && + !(qp->flags & MLX5_IB_QP_UNDERLAY)) || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr, false); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, + &context->alt_path, + attr->alt_port_num, + attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + 0, attr, true); + if (err) + goto out; + } + + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : + qp->port) - 1; + + /* Underlay port should be used - index 0 function per port */ + if (qp->flags & MLX5_IB_QP_UNDERLAY) + port_num = 0; + + mibport = &dev->port[port_num]; + context->qp_counter_set_usr_page |= + cpu_to_be32((u32)(mibport->cnts.set_id) << 24); + } + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + context->deth_sqpn = cpu_to_be32(1); + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) { + err = -EINVAL; + goto out; + } + + op = optab[mlx5_cur][mlx5_new]; + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + struct mlx5_modify_raw_qp_param raw_qp_param = {}; + + raw_qp_param.operation = op; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id; + raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; + } + + if (attr_mask & IB_QP_RATE_LIMIT) { + raw_qp_param.rl.rate = attr->rate_limit; + + if (ucmd->burst_info.max_burst_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_burst_bound)) { + raw_qp_param.rl.max_burst_sz = + ucmd->burst_info.max_burst_sz; + } else { + err = -EINVAL; + goto out; + } + } + + if (ucmd->burst_info.typical_pkt_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_typical_size)) { + raw_qp_param.rl.typical_pkt_sz = + ucmd->burst_info.typical_pkt_sz; + } else { + err = -EINVAL; + goto out; + } + } + + raw_qp_param.set_mask |= MLX5_RAW_QP_RATE_LIMIT; + } + + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); + } else { + err = mlx5_core_qp_modify(dev->mdev, op, optpar, context, + &base->mqp); + } + + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->trans_qp.atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->trans_qp.alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && + !ibqp->uobject && ibqp->qp_type != IB_QPT_XRC_TGT) { + mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + +out: + kfree(context); + return err; +} + +static inline bool is_valid_mask(int mask, int req, int opt) +{ + if ((mask & req) != req) + return false; + + if (mask & ~(req | opt)) + return false; + + return true; +} + +/* check valid transition for driver QP types + * for now the only QP type that this function supports is DCI + */ +static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new_state, + enum ib_qp_attr_mask attr_mask) +{ + int req = IB_QP_STATE; + int opt = 0; + + if (new_state == IB_QPS_RESET) { + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + req |= IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + opt = IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + req |= IB_QP_PATH_MTU; + opt = IB_QP_PKEY_INDEX; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { + req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_SQ_PSN; + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) { + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state != IB_QPS_RESET && new_state == IB_QPS_ERR) { + return is_valid_mask(attr_mask, req, opt); + } + return false; +} + +/* mlx5_ib_modify_dct: modify a DCT QP + * valid transitions are: + * RESET to INIT: must set access_flags, pkey_index and port + * INIT to RTR : must set min_rnr_timer, tclass, flow_label, + * mtu, gid_index and hop_limit + * Other transitions and attributes are illegal + */ +static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + enum ib_qp_state cur_state, new_state; + int err = 0; + int required = IB_QP_STATE; + void *dctc; + + if (!(attr_mask & IB_QP_STATE)) + return -EINVAL; + + cur_state = qp->state; + new_state = attr->qp_state; + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + + if (attr->port_num == 0 || + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); + return -EINVAL; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + MLX5_SET(dctc, dctc, rre, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + MLX5_SET(dctc, dctc, rwe, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { + if (!mlx5_ib_dc_atomic_is_supported(dev)) + return -EOPNOTSUPP; + MLX5_SET(dctc, dctc, rae, 1); + MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX); + } + MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); + MLX5_SET(dctc, dctc, port, attr->port_num); + MLX5_SET(dctc, dctc, counter_set_id, dev->port[attr->port_num - 1].cnts.set_id); + + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + struct mlx5_ib_modify_qp_resp resp = {}; + u32 min_resp_len = offsetof(typeof(resp), dctn) + + sizeof(resp.dctn); + + if (udata->outlen < min_resp_len) + return -EINVAL; + resp.response_length = min_resp_len; + + required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + MLX5_SET(dctc, dctc, min_rnr_nak, attr->min_rnr_timer); + MLX5_SET(dctc, dctc, tclass, attr->ah_attr.grh.traffic_class); + MLX5_SET(dctc, dctc, flow_label, attr->ah_attr.grh.flow_label); + MLX5_SET(dctc, dctc, mtu, attr->path_mtu); + MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); + MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); + if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); + + err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, + MLX5_ST_SZ_BYTES(create_dct_in)); + if (err) + return err; + resp.dctn = qp->dct.mdct.mqp.qpn; + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct); + return err; + } + } else { + mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state); + return -EINVAL; + } + if (err) + qp->state = IB_QPS_ERR; + else + qp->state = new_state; + return err; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_modify_qp ucmd = {}; + enum ib_qp_type qp_type; + enum ib_qp_state cur_state, new_state; + size_t required_cmd_sz; + int err = -EINVAL; + int port; + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (udata && udata->inlen) { + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd)))) + return -EFAULT; + + if (ucmd.comp_mask || + memchr_inv(&ucmd.reserved, 0, sizeof(ucmd.reserved)) || + memchr_inv(&ucmd.burst_info.reserved, 0, + sizeof(ucmd.burst_info.reserved))) + return -EOPNOTSUPP; + } + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); + + if (ibqp->qp_type == IB_QPT_DRIVER) + qp_type = qp->qp_sub_type; + else + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + + if (qp_type == MLX5_IB_QPT_DCT) + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); + + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); + } + + if (qp->flags & MLX5_IB_QP_UNDERLAY) { + if (attr_mask & ~(IB_QP_STATE | IB_QP_CUR_STATE)) { + mlx5_ib_dbg(dev, "invalid attr_mask 0x%x when underlay QP is used\n", + attr_mask); + goto out; + } + } else if (qp_type != MLX5_IB_QPT_REG_UMR && + qp_type != MLX5_IB_QPT_DCI && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, ibqp->qp_type, attr_mask); + goto out; + } else if (qp_type == MLX5_IB_QPT_DCI && + !modify_dci_qp_is_ok(cur_state, new_state, attr_mask)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, qp_type, attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || + attr->port_num > dev->num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); + goto out; + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= + dev->mdev->port_caps[port - 1].pkey_table_len) { + mlx5_ib_dbg(dev, "invalid pkey index %d\n", + attr->pkey_index); + goto out; + } + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, + new_state, &ucmd); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, + const struct ib_send_wr *wr, void *qend, + struct mlx5_ib_qp *qp, int *size) +{ + void *seg = eseg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start); + u64 left, leftlen, copysz; + void *pdata = ud_wr->header; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr.sz = cpu_to_be16(left); + + /* + * check if there is space till the end of queue, if yes, + * copy all in one shot, otherwise copy till the end of queue, + * rollback and than the copy the left + */ + leftlen = qend - (void *)eseg->inline_hdr.start; + copysz = min_t(u64, leftlen, left); + + memcpy(seg - size_of_inl_hdr_start, pdata, copysz); + + if (likely(copysz > size_of_inl_hdr_start)) { + seg += ALIGN(copysz - size_of_inl_hdr_start, 16); + *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16; + } + + if (unlikely(copysz < left)) { /* the last wqe in the queue */ + seg = mlx5_get_send_wqe(qp, 0); + left -= copysz; + pdata += copysz; + memcpy(seg, pdata, left); + seg += ALIGN(left, 16); + *size += ALIGN(left, 16) / 16; + } + } + + return seg; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + const struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static u64 get_xlt_octo(u64 bytes) +{ + return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / + MLX5_IB_UMR_OCTOWORD; +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, + struct mlx5_ib_mr *mr, bool umr_inline) +{ + int size = mr->ndescs * mr->desc_size; + + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + umr->flags |= MLX5_UMR_INLINE; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->mkey_mask = frwr_mkey_mask(); +} + +static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) +{ + memset(umr, 0, sizeof(*umr)); + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = MLX5_UMR_INLINE; +} + +static __be64 get_umr_enable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_disable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(int atomic) +{ + u64 result; + + result = MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW; + + if (atomic) + result |= MLX5_MKEY_MASK_A; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD; + + return cpu_to_be64(result); +} + +static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) +{ + if ((mask & MLX5_MKEY_MASK_PAGE_SIZE && + MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) || + (mask & MLX5_MKEY_MASK_A && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))) + return -EPERM; + return 0; +} + +static int set_reg_umr_segment(struct mlx5_ib_dev *dev, + struct mlx5_wqe_umr_ctrl_seg *umr, + const struct ib_send_wr *wr, int atomic) +{ + const struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(umr, 0, sizeof(*umr)); + + if (!umrwr->ignore_free_state) { + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + /* fail if free */ + umr->flags = MLX5_UMR_CHECK_FREE; + else + /* fail if not free */ + umr->flags = MLX5_UMR_CHECK_NOT_FREE; + } + + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { + u64 offset = get_xlt_octo(umrwr->offset); + + umr->xlt_offset = cpu_to_be16(offset & 0xffff); + umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { + umr->mkey_mask |= get_umr_update_access_mask(atomic); + umr->mkey_mask |= get_umr_update_pd_mask(); + } + if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) + umr->mkey_mask |= get_umr_enable_mr_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + umr->mkey_mask |= get_umr_disable_mr_mask(); + + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; + + return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask)); +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, + struct mlx5_ib_mr *mr, + u32 key, int access) +{ + int ndescs = ALIGN(mr->ndescs, 8) >> 1; + + memset(seg, 0, sizeof(*seg)); + + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; + seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(mr->ibmr.iova); + seg->len = cpu_to_be64(mr->ibmr.length); + seg->xlt_oct_size = cpu_to_be32(ndescs); +} + +static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) +{ + memset(seg, 0, sizeof(*seg)); + seg->status = MLX5_MKEY_STATUS_FREE; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, + const struct ib_send_wr *wr) +{ + const struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + seg->status = MLX5_MKEY_STATUS_FREE; + + seg->flags = convert_access(umrwr->access_flags); + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && + !umrwr->length) + seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64); + + seg->start_addr = cpu_to_be64(umrwr->virt_addr); + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(umrwr->mkey)); +} + +static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, + struct mlx5_ib_mr *mr, + struct mlx5_ib_pd *pd) +{ + int bcount = mr->desc_size * mr->ndescs; + + dseg->addr = cpu_to_be64(mr->desc_map); + dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); + dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); +} + +static void set_reg_umr_inline_seg(void *seg, struct mlx5_ib_qp *qp, + struct mlx5_ib_mr *mr, int mr_list_size) +{ + void *qend = qp->sq.qend; + void *addr = mr->descs; + int copy; + + if (unlikely(seg + mr_list_size > qend)) { + copy = qend - seg; + memcpy(seg, addr, copy); + addr += copy; + mr_list_size -= copy; + seg = mlx5_get_send_wqe(qp, 0); + } + memcpy(seg, addr, mr_list_size); + seg += mr_list_size; +} + +static __be32 send_ieth(const struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = qend - wqe; + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? + MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, + struct mlx5_ib_qp *qp, void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->sig_attrs; + struct ib_mr *sig_mr = wr->sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->wr.sg_list->length; + u32 data_key = wr->wr.sg_list->lkey; + u64 data_va = wr->wr.sg_list->addr; + int ret; + int wqe_size; + + if (!wr->prot || + (data_key == wr->prot->lkey && + data_va == wr->prot->addr && + data_len == wr->prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. + * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->prot->lkey; + u64 prot_va = wr->prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + const struct ib_sig_handover_wr *wr, u32 size, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->access_flags) | + MLX5_MKC_ACCESS_MODE_KLMS; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + u32 size) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size) +{ + const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 xlt_size; + int region_len, ret; + + if (unlikely(wr->wr.num_sge != 1) || + unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->wr.sg_list->length; + if (wr->prot && + (wr->prot->lkey != wr->wr.sg_list->lkey || + wr->prot->addr != wr->wr.sg_list->addr || + wr->prot->length != wr->wr.sg_list->length)) + region_len += wr->prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm); + + set_sig_umr_segment(*seg, xlt_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type (%d) is given.\n", + domain->sig_type); + return -EINVAL; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_reg_wr(struct mlx5_ib_qp *qp, + const struct ib_reg_wr *wr, + void **seg, int *size) +{ + struct mlx5_ib_mr *mr = to_mmr(wr->mr); + struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + int mr_list_size = mr->ndescs * mr->desc_size; + bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; + + if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), + "Invalid IB_SEND_INLINE send flag\n"); + return -EINVAL; + } + + set_reg_umr_seg(*seg, mr, umr_inline); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_reg_mkey_seg(*seg, mr, wr->key, wr->access); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + if (umr_inline) { + set_reg_umr_inline_seg(*seg, qp, mr, mr_list_size); + *size += get_xlt_octo(mr_list_size); + } else { + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size) +{ + set_linv_umr_seg(*seg); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_linv_mkey_seg(*seg); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); +} + +static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; + + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq, bool send_signaled, bool solicited) +{ + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) + return -ENOMEM; + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (solicited ? MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return 0; +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + return __begin_wqe(qp, seg, ctrl, wr, idx, size, nreq, + wr->send_flags & IB_SEND_SIGNALED, + wr->send_flags & IB_SEND_SOLICITED); +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, u64 wr_id, + int nreq, u8 fence, u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; +} + +static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp; + struct mlx5_ib_mr *mr; + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf; + int uninitialized_var(size); + void *qend; + unsigned long flags; + unsigned idx; + int err = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + qp = to_mqp(ibqp); + bf = &qp->bf; + qend = qp->sq.qend; + + spin_lock_irqsave(&qp->sq.lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->opcode == IB_WR_REG_MR) { + fence = dev->umr_fence; + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + } else { + if (wr->send_flags & IB_SEND_FENCE) { + if (qp->next_fence) + fence = MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + fence = MLX5_FENCE_MODE_FENCE; + } else { + fence = qp->next_fence; + } + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + set_linv_wr(qp, &seg, &size); + num_sge = 0; + break; + + case IB_WR_REG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_MR; + ctrl->imm = cpu_to_be32(reg_wr(wr)->key); + err = set_reg_wr(qp, reg_wr(wr), &seg, &size); + if (err) { + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(sig_handover_wr(wr)->sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, nreq, false, true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_SET_PSV); + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, nreq, false, true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_SET_PSV); + qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + num_sge = 0; + goto skip_psv; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_SMI: + if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) { + mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n"); + err = -EPERM; + *bad_wr = wr; + goto out; + } + /* fall through */ + case MLX5_IB_QPT_HW_GSI: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + seg += sizeof(struct mlx5_wqe_eth_pad); + size += sizeof(struct mlx5_wqe_eth_pad) / 16; + + seg = set_eth_seg(seg, wr, qend, qp, &size); + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + } + break; + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey); + err = set_reg_umr_segment(dev, seg, wr, !!(MLX5_CAP_GEN(mdev, atomic))); + if (unlikely(err)) + goto out; + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + qp->next_fence = next_fence; + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + + /* currently we support only regular doorbells */ + mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, NULL); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + bf->offset ^= bf->buf_size; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return _mlx5_ib_post_send(ibqp, wr, bad_wr, false); +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return _mlx5_ib_post_recv(ibqp, wr, bad_wr, false); +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, + struct rdma_ah_attr *ah_attr, + struct mlx5_qp_path *path) +{ + + memset(ah_attr, 0, sizeof(*ah_attr)); + + if (!path->port || path->port > ibdev->num_ports) + return; + + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port); + + rdma_ah_set_port_num(ah_attr, path->port); + rdma_ah_set_sl(ah_attr, path->dci_cfi_prio_sl & 0xf); + + rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid)); + rdma_ah_set_path_bits(ah_attr, path->grh_mlid & 0x7f); + rdma_ah_set_static_rate(ah_attr, + path->static_rate ? path->static_rate - 5 : 0); + + if (path->grh_mlid & (1 << 7) || + ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + u32 tc_fl = be32_to_cpu(path->tclass_flowlabel); + + rdma_ah_set_grh(ah_attr, NULL, + tc_fl & 0xfffff, + path->mgid_index, + path->hop_limit, + (tc_fl >> 20) & 0xff); + rdma_ah_set_dgid_raw(ah_attr, path->rgid); + } +} + +static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u8 *sq_state) +{ + int err; + + err = mlx5_core_query_sq_state(dev->mdev, sq->base.mqp.qpn, sq_state); + if (err) + goto out; + sq->state = *sq_state; + +out: + return err; +} + +static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = kvzalloc(inlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + rq->state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + [MLX5_SQ_STATE_NA] = IB_QPS_RESET, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + [MLX5_SQ_STATE_NA] = IB_QPS_ERR, + }, + [MLX5_RQ_STATE_NA] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) { + WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", + qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, + qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); + return -EINVAL; + } + + if (*qp_state == MLX5_QP_STATE) + *qp_state = qp->state; + + return 0; +} + +static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u8 *raw_packet_qp_state) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + int err; + u8 sq_state = MLX5_SQ_STATE_NA; + u8 rq_state = MLX5_RQ_STATE_NA; + + if (qp->sq.wqe_cnt) { + err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); + if (err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); + if (err) + return err; + } + + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_packet_qp_state); +} + +static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_attr *qp_attr) +{ + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + struct mlx5_qp_context *context; + int mlx5_state; + u32 *outb; + int err = 0; + + outb = kzalloc(outlen, GFP_KERNEL); + if (!outb) + return -ENOMEM; + + err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, + outlen); + if (err) + goto out; + + /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ + context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc); + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = + be16_to_cpu(context->alt_path.pkey_index); + qp_attr->alt_port_num = + rdma_ah_get_port_num(&qp_attr->alt_ah_attr); + } + + qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + +out: + kfree(outb); + return err; +} + +static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_core_dct *dct = &mqp->dct.mdct; + u32 *out; + u32 access_flags = 0; + int outlen = MLX5_ST_SZ_BYTES(query_dct_out); + void *dctc; + int err; + int supported_mask = IB_QP_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT | + IB_QP_MIN_RNR_TIMER | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_PKEY_INDEX; + + if (qp_attr_mask & ~supported_mask) + return -EINVAL; + if (mqp->state != IB_QPS_RTR) + return -EINVAL; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_dct_query(dev->mdev, dct, out, outlen); + if (err) + goto out; + + dctc = MLX5_ADDR_OF(query_dct_out, out, dct_context_entry); + + if (qp_attr_mask & IB_QP_STATE) + qp_attr->qp_state = IB_QPS_RTR; + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + if (MLX5_GET(dctc, dctc, rre)) + access_flags |= IB_ACCESS_REMOTE_READ; + if (MLX5_GET(dctc, dctc, rwe)) + access_flags |= IB_ACCESS_REMOTE_WRITE; + if (MLX5_GET(dctc, dctc, rae)) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + qp_attr->qp_access_flags = access_flags; + } + + if (qp_attr_mask & IB_QP_PORT) + qp_attr->port_num = MLX5_GET(dctc, dctc, port); + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); + if (qp_attr_mask & IB_QP_AV) { + qp_attr->ah_attr.grh.traffic_class = MLX5_GET(dctc, dctc, tclass); + qp_attr->ah_attr.grh.flow_label = MLX5_GET(dctc, dctc, flow_label); + qp_attr->ah_attr.grh.sgid_index = MLX5_GET(dctc, dctc, my_addr_index); + qp_attr->ah_attr.grh.hop_limit = MLX5_GET(dctc, dctc, hop_limit); + } + if (qp_attr_mask & IB_QP_PATH_MTU) + qp_attr->path_mtu = MLX5_GET(dctc, dctc, mtu); + if (qp_attr_mask & IB_QP_PKEY_INDEX) + qp_attr->pkey_index = MLX5_GET(dctc, dctc, pkey_index); +out: + kfree(out); + return err; +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int err = 0; + u8 raw_packet_qp_state; + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, + qp_init_attr); + + /* Not all of output fields are applicable, make sure to zero them */ + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + memset(qp_attr, 0, sizeof(*qp_attr)); + + if (unlikely(qp->qp_sub_type == MLX5_IB_QPT_DCT)) + return mlx5_ib_dct_query_qp(dev, qp, qp_attr, + qp_attr_mask, qp_init_attr); + + mutex_lock(&qp->mutex); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); + if (err) + goto out; + qp->state = raw_packet_qp_state; + qp_attr->port_num = 1; + } else { + err = query_qp_attr(dev, qp, qp_attr); + if (err) + goto out; + } + + qp_attr->qp_state = qp->state; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.max_post; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_init_attr->qp_context = ibqp->qp_context; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->srq = ibqp->srq; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1(); + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_xrcd *xrcd; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + if (err) + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + + kfree(xrcd); + return 0; +} + +static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) +{ + struct mlx5_ib_rwq *rwq = to_mibrwq(core_qp); + struct mlx5_ib_dev *dev = to_mdev(rwq->ibwq.device); + struct ib_event event; + + if (rwq->ibwq.event_handler) { + event.device = rwq->ibwq.device; + event.element.wq = &rwq->ibwq; + switch (type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_WQ_FATAL; + break; + default: + mlx5_ib_warn(dev, "Unexpected event type %d on WQ %06x\n", type, core_qp->qpn); + return; + } + + rwq->ibwq.event_handler(&event, rwq->ibwq.wq_context); + } +} + +static int set_delay_drop(struct mlx5_ib_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->delay_drop.lock); + if (dev->delay_drop.activate) + goto out; + + err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout); + if (err) + goto out; + + dev->delay_drop.activate = true; +out: + mutex_unlock(&dev->delay_drop.lock); + + if (!err) + atomic_inc(&dev->delay_drop.rqs_cnt); + return err; +} + +static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, + struct ib_wq_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev; + int has_net_offloads; + __be64 *rq_pas0; + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + dev = to_mdev(pd->device); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, mem_rq_type, + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, user_index, rwq->user_index); + MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, + rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ ? + MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ : MLX5_WQ_TYPE_CYCLIC); + if (init_attr->create_flags & IB_WQ_FLAGS_PCI_WRITE_END_PADDING) { + if (!MLX5_CAP_GEN(dev->mdev, end_pad)) { + mlx5_ib_dbg(dev, "Scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } else { + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + } + } + MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ) { + MLX5_SET(wq, wq, two_byte_shift_en, rwq->two_byte_shift_en); + MLX5_SET(wq, wq, log_wqe_stride_size, + rwq->single_stride_log_num_of_bytes - + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES); + MLX5_SET(wq, wq, log_wqe_num_of_strides, rwq->log_num_strides - + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES); + } + MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); + MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); + MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); + MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); + MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); + MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); + has_net_offloads = MLX5_CAP_GEN(dev->mdev, eth_net_offloads); + if (init_attr->create_flags & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + } else { + MLX5_SET(rqc, rqc, vsd, 1); + } + if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, scatter_fcs))) { + mlx5_ib_dbg(dev, "Scatter FCS is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, scatter_fcs, 1); + } + if (init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + if (!(dev->ib_dev.attrs.raw_packet_caps & + IB_RAW_PACKET_CAP_DELAY_DROP)) { + mlx5_ib_dbg(dev, "Delay drop is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, delay_drop_en, 1); + } + rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + err = set_delay_drop(dev); + if (err) { + mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", + err); + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + } else { + rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; + } + } +out: + kvfree(in); + return err; +} + +static int set_user_rq_size(struct mlx5_ib_dev *dev, + struct ib_wq_init_attr *wq_init_attr, + struct mlx5_ib_create_wq *ucmd, + struct mlx5_ib_rwq *rwq) +{ + /* Sanity check RQ size before proceeding */ + if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz))) + return -EINVAL; + + if (!ucmd->rq_wqe_count) + return -EINVAL; + + rwq->wqe_count = ucmd->rq_wqe_count; + rwq->wqe_shift = ucmd->rq_wqe_shift; + if (check_shl_overflow(rwq->wqe_count, rwq->wqe_shift, &rwq->buf_size)) + return -EINVAL; + + rwq->log_rq_stride = rwq->wqe_shift; + rwq->log_rq_size = ilog2(rwq->wqe_count); + return 0; +} + +static int prepare_user_rq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_wq ucmd = {}; + int err; + size_t required_cmd_sz; + + required_cmd_sz = offsetof(typeof(ucmd), single_stride_log_num_of_bytes) + + sizeof(ucmd.single_stride_log_num_of_bytes); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask & (~MLX5_IB_CREATE_WQ_STRIDING_RQ)) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } else if (ucmd.comp_mask & MLX5_IB_CREATE_WQ_STRIDING_RQ) { + if (!MLX5_CAP_GEN(dev->mdev, striding_rq)) { + mlx5_ib_dbg(dev, "Striding RQ is not supported\n"); + return -EOPNOTSUPP; + } + if ((ucmd.single_stride_log_num_of_bytes < + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES) || + (ucmd.single_stride_log_num_of_bytes > + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES)) { + mlx5_ib_dbg(dev, "Invalid log stride size (%u. Range is %u - %u)\n", + ucmd.single_stride_log_num_of_bytes, + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES, + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES); + return -EINVAL; + } + if ((ucmd.single_wqe_log_num_of_strides > + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) || + (ucmd.single_wqe_log_num_of_strides < + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES)) { + mlx5_ib_dbg(dev, "Invalid log num strides (%u. Range is %u - %u)\n", + ucmd.single_wqe_log_num_of_strides, + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES, + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES); + return -EINVAL; + } + rwq->single_stride_log_num_of_bytes = + ucmd.single_stride_log_num_of_bytes; + rwq->log_num_strides = ucmd.single_wqe_log_num_of_strides; + rwq->two_byte_shift_en = !!ucmd.two_byte_shift_en; + rwq->create_flags |= MLX5_IB_WQ_FLAGS_STRIDING_RQ; + } + + err = set_user_rq_size(dev, init_attr, &ucmd, rwq); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + err = create_user_rq(dev, pd, rwq, &ucmd); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + if (err) + return err; + } + + rwq->user_index = ucmd.user_index; + return 0; +} + +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_rwq *rwq; + struct mlx5_ib_create_wq_resp resp = {}; + size_t min_resp_len; + int err; + + if (!udata) + return ERR_PTR(-ENOSYS); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + if (!capable(CAP_SYS_RAWIO) && + init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) + return ERR_PTR(-EPERM); + + dev = to_mdev(pd->device); + switch (init_attr->wq_type) { + case IB_WQT_RQ: + rwq = kzalloc(sizeof(*rwq), GFP_KERNEL); + if (!rwq) + return ERR_PTR(-ENOMEM); + err = prepare_user_rq(pd, init_attr, udata, rwq); + if (err) + goto err; + err = create_rq(rwq, pd, init_attr); + if (err) + goto err_user_rq; + break; + default: + mlx5_ib_dbg(dev, "unsupported wq type %d\n", + init_attr->wq_type); + return ERR_PTR(-EINVAL); + } + + rwq->ibwq.wq_num = rwq->core_qp.qpn; + rwq->ibwq.state = IB_WQS_RESET; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + rwq->core_qp.event = mlx5_ib_wq_event; + rwq->ibwq.event_handler = init_attr->event_handler; + return &rwq->ibwq; + +err_copy: + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); +err_user_rq: + destroy_user_rq(dev, pd, rwq); +err: + kfree(rwq); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_wq(struct ib_wq *wq) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + destroy_user_rq(dev, wq->pd, rwq); + kfree(rwq); + + return 0; +} + +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl; + int sz = 1 << init_attr->log_ind_tbl_size; + struct mlx5_ib_create_rwq_ind_tbl_resp resp = {}; + size_t min_resp_len; + int inlen; + int err; + int i; + u32 *in; + void *rqtc; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + if (init_attr->log_ind_tbl_size > + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) { + mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n", + init_attr->log_ind_tbl_size, + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)); + return ERR_PTR(-EINVAL); + } + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL); + if (!rwq_ind_tbl) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); + + err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); + kvfree(in); + + if (err) + goto err; + + rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + return &rwq_ind_tbl->ib_rwq_ind_tbl; + +err_copy: + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); +err: + kfree(rwq_ind_tbl); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); + struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device); + + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + + kfree(rwq_ind_tbl); + return 0; +} + +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + struct mlx5_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + int curr_wq_state; + int wq_state; + int inlen; + int err; + void *rqc; + void *in; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + curr_wq_state = (wq_attr_mask & IB_WQ_CUR_STATE) ? + wq_attr->curr_wq_state : wq->state; + wq_state = (wq_attr_mask & IB_WQ_STATE) ? + wq_attr->wq_state : curr_wq_state; + if (curr_wq_state == IB_WQS_ERR) + curr_wq_state = MLX5_RQC_STATE_ERR; + if (wq_state == IB_WQS_ERR) + wq_state = MLX5_RQC_STATE_ERR; + MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(rqc, rqc, state, wq_state); + + if (wq_attr_mask & IB_WQ_FLAGS) { + if (wq_attr->flags_mask & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not " + "supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD); + MLX5_SET(rqc, rqc, vsd, + (wq_attr->flags & IB_WQ_FLAGS_CVLAN_STRIPPING) ? 0 : 1); + } + + if (wq_attr->flags_mask & IB_WQ_FLAGS_PCI_WRITE_END_PADDING) { + mlx5_ib_dbg(dev, "Modifying scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + } + + if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, + dev->port->cnts.set_id); + } else + pr_info_once("%s: Receive WQ counters are not supported on current FW\n", + dev->ib_dev.name); + } + + err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); + if (!err) + rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; + +out: + kvfree(in); + return err; +} + +struct mlx5_ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void mlx5_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_drain_cqe *cqe = container_of(wc->wr_cqe, + struct mlx5_ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* This function returns only once the drained WR was completed */ +static void handle_drain_completion(struct ib_cq *cq, + struct mlx5_ib_drain_cqe *sdrain, + struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + + if (cq->poll_ctx == IB_POLL_DIRECT) { + while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + return; + } + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + struct mlx5_ib_cq *mcq = to_mcq(cq); + bool triggered = false; + unsigned long flags; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + /* Make sure that the CQ handler won't run if wasn't run yet */ + if (!mcq->mcq.reset_notify_added) + mcq->mcq.reset_notify_added = 1; + else + triggered = true; + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (triggered) { + /* Wait for any scheduled/running task to be ended */ + switch (cq->poll_ctx) { + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + irq_poll_enable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + cancel_work_sync(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + } + + /* Run the CQ handler - this makes sure that the drain WR will + * be processed if wasn't processed yet. + */ + mcq->mcq.comp(&mcq->mcq); + } + + wait_for_completion(&sdrain->done); +} + +void mlx5_ib_drain_sq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->send_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx5_ib_drain_cqe sdrain; + const struct ib_send_wr *bad_swr; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_core_dev *mdev = dev->mdev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + sdrain.cqe.done = mlx5_ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = _mlx5_ib_post_send(qp, &swr.wr, &bad_swr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &sdrain, dev); +} + +void mlx5_ib_drain_rq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->recv_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx5_ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}; + const struct ib_recv_wr *bad_rwr; + int ret; + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_core_dev *mdev = dev->mdev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = mlx5_ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = _mlx5_ib_post_recv(qp, &rwr, &bad_rwr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &rdrain, dev); +} diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c new file mode 100644 index 000000000..d359fecf7 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/mlx5/qp.h> +#include <linux/mlx5/srq.h> +#include <linux/slab.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> + +#include "mlx5_ib.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_srq_attr *in, + struct ib_udata *udata, int buf_size) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd = {}; + size_t ucmdlen; + int err; + int npages; + int page_shift; + int ncont; + u32 offset; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + + ucmdlen = min(udata->inlen, sizeof(ucmd)); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_dbg(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EINVAL; + + if (in->type != IB_SRQT_BASIC) { + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + } + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, 0, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + in->pas = kvcalloc(ncont, sizeof(*in->pas), GFP_KERNEL); + if (!in->pas) { + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_in; + } + + in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->page_offset = offset; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type != IB_SRQT_BASIC) + in->user_index = uidx; + + return 0; + +err_in: + kvfree(in->pas); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_srq_attr *in, int buf_size) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + if (mlx5_buf_alloc(dev->mdev, buf_size, &srq->buf)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + mlx5_ib_dbg(dev, "srq->buf.page_shift = %d\n", srq->buf.page_shift); + in->pas = kvcalloc(srq->buf.npages, sizeof(*in->pas), GFP_KERNEL); + if (!in->pas) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, in->pas); + + srq->wrid = kvmalloc_array(srq->msrq.max, sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type != IB_SRQT_BASIC) + in->user_index = MLX5_IB_DEFAULT_UIDX; + + return 0; + +err_in: + kvfree(in->pas); + +err_buf: + mlx5_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kvfree(srq->wrid); + mlx5_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_srq *srq; + size_t desc_size; + size_t buf_size; + int err; + struct mlx5_srq_attr in = {0}; + __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes) { + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + if (desc_size == 0 || srq->msrq.max_gs > desc_size) { + err = -EINVAL; + goto err_srq; + } + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(size_t, 32, desc_size); + if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) { + err = -EINVAL; + goto err_srq; + } + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + if (buf_size < desc_size) { + err = -EINVAL; + goto err_srq; + } + in.type = init_attr->srq_type; + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size); + else + err = create_srq_kernel(dev, srq, &in, buf_size); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? "user" : "kernel", err); + goto err_srq; + } + + in.log_size = ilog2(srq->msrq.max); + in.wqe_shift = srq->msrq.wqe_shift - 4; + if (srq->wq_sig) + in.flags |= MLX5_SRQ_FLAG_WQ_SIG; + + if (init_attr->srq_type == IB_SRQT_XRC) + in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + else + in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn; + + if (init_attr->srq_type == IB_SRQT_TM) { + in.tm_log_list_size = + ilog2(init_attr->ext.tag_matching.max_num_tags) + 1; + if (in.tm_log_list_size > + MLX5_CAP_GEN(dev->mdev, log_tag_matching_list_sz)) { + mlx5_ib_dbg(dev, "TM SRQ max_num_tags exceeding limit\n"); + err = -EINVAL; + goto err_usr_kern_srq; + } + in.flags |= MLX5_SRQ_FLAG_RNDV; + } + + if (ib_srq_has_cq(init_attr->srq_type)) + in.cqn = to_mcq(init_attr->ext.cq)->mcq.cqn; + else + in.cqn = to_mcq(dev->devr.c0)->mcq.cqn; + + in.pd = to_mpd(pd)->pdn; + in.db_record = srq->db.dma; + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in); + kvfree(in.pas); + if (err) { + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_dbg(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_srq_attr *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = out->lwm; + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} |