diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx4')
-rw-r--r-- | drivers/infiniband/hw/mlx4/Kconfig | 12 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/Makefile | 3 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/ah.c | 257 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/alias_GUID.c | 905 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/cm.c | 483 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/cq.c | 980 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/doorbell.c | 96 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/mad.c | 2399 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/main.c | 3450 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/mcg.c | 1257 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/mlx4_ib.h | 932 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/mr.c | 826 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/qp.c | 4466 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/srq.c | 378 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/sysfs.c | 886 |
15 files changed, 17330 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig new file mode 100644 index 000000000..d1de3285f --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -0,0 +1,12 @@ +config MLX4_INFINIBAND + tristate "Mellanox ConnectX HCA support" + depends on NETDEVICES && ETHERNET && PCI && INET + depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS + depends on MAY_USE_DEVLINK + select NET_VENDOR_MELLANOX + select MLX4_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox ConnectX PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile new file mode 100644 index 000000000..f4213b3a8 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o + +mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c new file mode 100644 index 000000000..e9e3a6f39 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> + +#include <linux/slab.h> +#include <linux/inet.h> +#include <linux/string.h> +#include <linux/mlx4/driver.h> + +#include "mlx4_ib.h" + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd->device)->dev; + + ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | + (rdma_ah_get_port_num(ah_attr) << 24)); + ah->av.ib.g_slid = rdma_ah_get_path_bits(ah_attr); + ah->av.ib.sl_tclass_flowlabel = + cpu_to_be32(rdma_ah_get_sl(ah_attr) << 28); + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + ah->av.ib.g_slid |= 0x80; + ah->av.ib.gid_index = grh->sgid_index; + ah->av.ib.hop_limit = grh->hop_limit; + ah->av.ib.sl_tclass_flowlabel |= + cpu_to_be32((grh->traffic_class << 20) | + grh->flow_label); + memcpy(ah->av.ib.dgid, grh->dgid.raw, 16); + } + + ah->av.ib.dlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr)); + if (rdma_ah_get_static_rate(ah_attr)) { + u8 static_rate = rdma_ah_get_static_rate(ah_attr) + + MLX4_STAT_RATE_OFFSET; + + while (static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << static_rate & dev->caps.stat_rate_support)) + --static_rate; + ah->av.ib.stat_rate = static_rate; + } + + return &ah->ibah; +} + +static struct ib_ah *create_iboe_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd->device); + const struct ib_gid_attr *gid_attr; + struct mlx4_dev *dev = ibdev->dev; + int is_mcast = 0; + struct in6_addr in6; + u16 vlan_tag = 0xffff; + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + int ret; + + memcpy(&in6, grh->dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) + is_mcast = 1; + + memcpy(ah->av.eth.mac, ah_attr->roce.dmac, ETH_ALEN); + eth_zero_addr(ah->av.eth.s_mac); + + /* + * If sgid_attr is NULL we are being called by mlx4_ib_create_ah_slave + * and we are directly creating an AV for a slave's gid_index. + */ + gid_attr = ah_attr->grh.sgid_attr; + if (gid_attr) { + if (is_vlan_dev(gid_attr->ndev)) + vlan_tag = vlan_dev_vlan_id(gid_attr->ndev); + memcpy(ah->av.eth.s_mac, gid_attr->ndev->dev_addr, ETH_ALEN); + ret = mlx4_ib_gid_index_to_real_index(ibdev, gid_attr); + if (ret < 0) + return ERR_PTR(ret); + ah->av.eth.gid_index = ret; + } else { + /* mlx4_ib_create_ah_slave fills in the s_mac and the vlan */ + ah->av.eth.gid_index = ah_attr->grh.sgid_index; + } + + if (vlan_tag < 0x1000) + vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13; + ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | + (rdma_ah_get_port_num(ah_attr) << 24)); + ah->av.eth.vlan = cpu_to_be16(vlan_tag); + ah->av.eth.hop_limit = grh->hop_limit; + if (rdma_ah_get_static_rate(ah_attr)) { + ah->av.eth.stat_rate = rdma_ah_get_static_rate(ah_attr) + + MLX4_STAT_RATE_OFFSET; + while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support)) + --ah->av.eth.stat_rate; + } + ah->av.eth.sl_tclass_flowlabel |= + cpu_to_be32((grh->traffic_class << 20) | + grh->flow_label); + /* + * HW requires multicast LID so we just choose one. + */ + if (is_mcast) + ah->av.ib.dlid = cpu_to_be16(0xc000); + + memcpy(ah->av.eth.dgid, grh->dgid.raw, 16); + ah->av.eth.sl_tclass_flowlabel |= cpu_to_be32(rdma_ah_get_sl(ah_attr) + << 29); + return &ah->ibah; +} + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) + +{ + struct mlx4_ib_ah *ah; + struct ib_ah *ret; + + ah = kzalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + } else { + /* + * TBD: need to handle the case when we get + * called in an atomic context and there we + * might sleep. We don't expect this + * currently since we're working with link + * local addresses which we can translate + * without going to sleep. + */ + ret = create_iboe_ah(pd, ah_attr, ah); + } + + if (IS_ERR(ret)) + kfree(ah); + + return ret; + } else + return create_ib_ah(pd, ah_attr, ah); /* never fails */ +} + +/* AH's created via this call must be free'd by mlx4_ib_destroy_ah. */ +struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + int slave_sgid_index, u8 *s_mac, + u16 vlan_tag) +{ + struct rdma_ah_attr slave_attr = *ah_attr; + struct mlx4_ib_ah *mah; + struct ib_ah *ah; + + slave_attr.grh.sgid_attr = NULL; + slave_attr.grh.sgid_index = slave_sgid_index; + ah = mlx4_ib_create_ah(pd, &slave_attr, NULL); + if (IS_ERR(ah)) + return ah; + + ah->device = pd->device; + ah->pd = pd; + ah->type = ah_attr->type; + mah = to_mah(ah); + + /* get rid of force-loopback bit */ + mah->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) + memcpy(mah->av.eth.s_mac, s_mac, 6); + + if (vlan_tag < 0x1000) + vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13; + mah->av.eth.vlan = cpu_to_be16(vlan_tag); + + return ah; +} + +int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah = to_mah(ibah); + int port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->type = ibah->type; + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + rdma_ah_set_dlid(ah_attr, 0); + rdma_ah_set_sl(ah_attr, + be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) + >> 29); + } else { + rdma_ah_set_dlid(ah_attr, be16_to_cpu(ah->av.ib.dlid)); + rdma_ah_set_sl(ah_attr, + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) + >> 28); + } + + rdma_ah_set_port_num(ah_attr, port_num); + if (ah->av.ib.stat_rate) + rdma_ah_set_static_rate(ah_attr, + ah->av.ib.stat_rate - + MLX4_STAT_RATE_OFFSET); + rdma_ah_set_path_bits(ah_attr, ah->av.ib.g_slid & 0x7F); + if (mlx4_ib_ah_grh_present(ah)) { + u32 tc_fl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel); + + rdma_ah_set_grh(ah_attr, NULL, + tc_fl & 0xfffff, ah->av.ib.gid_index, + ah->av.ib.hop_limit, + tc_fl >> 20); + rdma_ah_set_dgid_raw(ah_attr, ah->av.ib.dgid); + } + + return 0; +} + +int mlx4_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c new file mode 100644 index 000000000..baab9afa9 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -0,0 +1,905 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_pack.h> +#include <linux/mlx4/cmd.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <rdma/ib_user_verbs.h> +#include <linux/delay.h> +#include "mlx4_ib.h" + +/* +The driver keeps the current state of all guids, as they are in the HW. +Whenever we receive an smp mad GUIDInfo record, the data will be cached. +*/ + +struct mlx4_alias_guid_work_context { + u8 port; + struct mlx4_ib_dev *dev ; + struct ib_sa_query *sa_query; + struct completion done; + int query_id; + struct list_head list; + int block_num; + ib_sa_comp_mask guid_indexes; + u8 method; +}; + +struct mlx4_next_alias_guid_work { + u8 port; + u8 block_num; + u8 method; + struct mlx4_sriov_alias_guid_info_rec_det rec_det; +}; + +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, + u8 port_num, u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int port_index = port_num - 1; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* The location of the specific index starts from bit number 4 + * until bit num 11 */ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_slaves) { + pr_debug("The last slave: %d\n", slave_id); + return; + } + + /* cache the guid: */ + memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id], + &p_data[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } else + pr_debug("Guid number: %d in block: %d" + " was not updated\n", i, block_num); + } +} + +static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index) +{ + if (index >= NUM_ALIAS_GUID_PER_PORT) { + pr_err("%s: ERROR: asked for index:%d\n", __func__, index); + return (__force __be64) -1; + } + return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index]; +} + + +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) +{ + return IB_SA_COMP_MASK(4 + index); +} + +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init) +{ + __be64 curr_guid, required_guid; + int record_num = slave / 8; + int index = slave % 8; + int port_index = port - 1; + unsigned long flags; + int do_work = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port_index].state_flags & + GUID_STATE_NEED_PORT_INIT) + goto unlock; + if (!slave_init) { + curr_guid = *(__be64 *)&dev->sriov. + alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index]; + if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) || + !curr_guid) + goto unlock; + required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } else { + required_guid = mlx4_get_admin_guid(dev->dev, slave, port); + if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto unlock; + } + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index] = required_guid; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(index); + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE; + /* set to run immediately */ + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].time_to_run = 0; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + guids_retry_schedule[index] = 0; + do_work = 1; +unlock: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + + if (do_work) + mlx4_ib_init_alias_guid_work(dev, port_index); +} + +/* + * Whenever new GUID is set/unset (guid table change) create event and + * notify the relevant slave (master also should be notified). + * If the GUID value is not as we have in the cache the slave will not be + * updated; in this case it waits for the smp_snoop or the port management + * event to call the function and to update the slave. + * block_number - the index of the block (16 blocks available) + * port_number - 1 or 2 + */ +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id, slave_port; + enum slave_port_state new_state; + enum slave_port_state prev_state; + __be64 tmp_cur_ag, form_cache_ag; + enum slave_port_gen_event gen_event; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags; + __be64 required_value; + + if (!mlx4_is_master(dev->dev)) + return; + + rec = &dev->sriov.alias_guid.ports_guid[port_num - 1]. + all_rec_per_port[block_num]; + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + /*calculate the slaves and notify them*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* the location of the specific index runs from bits 4..11 */ + if (!(test_bit(i + 4, (unsigned long *)&guid_indexes))) + continue; + + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->persist->num_vfs + 1) + return; + + slave_port = mlx4_phys_to_slave_port(dev->dev, slave_id, port_num); + if (slave_port < 0) /* this port isn't available for the VF */ + continue; + + tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; + form_cache_ag = get_cached_alias_guid(dev, port_num, + (NUM_ALIAS_GUID_IN_REC * block_num) + i); + /* + * Check if guid is not the same as in the cache, + * If it is different, wait for the snoop_smp or the port mgmt + * change event to update the slave on its port state change + */ + if (tmp_cur_ag != form_cache_ag) + continue; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + + if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + required_value = 0; + + if (tmp_cur_ag == required_value) { + rec->guid_indexes = rec->guid_indexes & + ~mlx4_ib_get_aguid_comp_mask_from_ix(i); + } else { + /* may notify port down if value is 0 */ + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { + spin_unlock_irqrestore(&dev->sriov. + alias_guid.ag_work_lock, flags); + continue; + } + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, + flags); + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + /*2 cases: Valid GUID, and Invalid Guid*/ + + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/ + prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num); + new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + pr_debug("slave: %d, port: %d prev_port_state: %d," + " new_port_state: %d, gen_event: %d\n", + slave_id, port_num, prev_state, new_state, gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_UP) { + pr_debug("sending PORT_UP event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } + } else { /* request to invalidate GUID */ + set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) { + pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, + slave_id, + port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } + } + } +} + +static void aliasguid_query_handler(int status, + struct ib_sa_guidinfo_rec *guid_rec, + void *context) +{ + struct mlx4_ib_dev *dev; + struct mlx4_alias_guid_work_context *cb_ctx = context; + u8 port_index ; + int i; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags, flags1; + ib_sa_comp_mask declined_guid_indexes = 0; + ib_sa_comp_mask applied_guid_indexes = 0; + unsigned int resched_delay_sec = 0; + + if (!context) + return; + + dev = cb_ctx->dev; + port_index = cb_ctx->port - 1; + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[cb_ctx->block_num]; + + if (status) { + pr_debug("(port: %d) failed: status = %d\n", + cb_ctx->port, status); + rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC; + goto out; + } + + if (guid_rec->block_num != cb_ctx->block_num) { + pr_err("block num mismatch: %d != %d\n", + cb_ctx->block_num, guid_rec->block_num); + goto out; + } + + pr_debug("lid/port: %d/%d, block_num: %d\n", + be16_to_cpu(guid_rec->lid), cb_ctx->port, + guid_rec->block_num); + + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[guid_rec->block_num]; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { + __be64 sm_response, required_val; + + if (!(cb_ctx->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + sm_response = *(__be64 *)&guid_rec->guid_info_list + [i * GUID_REC_SIZE]; + required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) { + if (required_val == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto next_entry; + + /* A new value was set till we got the response */ + pr_debug("need to set new value %llx, record num %d, block_num:%d\n", + be64_to_cpu(required_val), + i, guid_rec->block_num); + goto entry_declined; + } + + /* check if the SM didn't assign one of the records. + * if it didn't, re-ask for. + */ + if (sm_response == MLX4_NOT_SET_GUID) { + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, + "%s:Record num %d in block_num: %d was declined by SM\n", + __func__, i, + guid_rec->block_num); + goto entry_declined; + } else { + /* properly assigned record. */ + /* We save the GUID we just got from the SM in the + * admin_guid in order to be persistent, and in the + * request from the sm the process will ask for the same GUID */ + if (required_val && + sm_response != required_val) { + /* Warn only on first retry */ + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept, SM returned (0x%llx)\n", + __func__, i, + guid_rec->block_num, + be64_to_cpu(required_val), + be64_to_cpu(sm_response)); + goto entry_declined; + } else { + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + sm_response; + if (required_val == 0) + mlx4_set_admin_guid(dev->dev, + sm_response, + (guid_rec->block_num + * NUM_ALIAS_GUID_IN_REC) + i, + cb_ctx->port); + goto next_entry; + } + } +entry_declined: + declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + rec->guids_retry_schedule[i] = + (rec->guids_retry_schedule[i] == 0) ? 1 : + min((unsigned int)60, + rec->guids_retry_schedule[i] * 2); + /* using the minimum value among all entries in that record */ + resched_delay_sec = (resched_delay_sec == 0) ? + rec->guids_retry_schedule[i] : + min(resched_delay_sec, + rec->guids_retry_schedule[i]); + continue; + +next_entry: + rec->guids_retry_schedule[i] = 0; + } + + applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes; + if (declined_guid_indexes || + rec->guid_indexes & ~(applied_guid_indexes)) { + pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n", + guid_rec->block_num, + be64_to_cpu((__force __be64)rec->guid_indexes), + be64_to_cpu((__force __be64)applied_guid_indexes), + be64_to_cpu((__force __be64)declined_guid_indexes)); + rec->time_to_run = ktime_get_boot_ns() + + resched_delay_sec * NSEC_PER_SEC; + } else { + rec->status = MLX4_GUID_INFO_STATUS_SET; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + /* + The func is call here to close the cases when the + sm doesn't send smp, so in the sa response the driver + notifies the slave. + */ + mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num, + cb_ctx->port, + guid_rec->guid_info_list); +out: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + get_low_record_time_index(dev, port_index, &resched_delay_sec); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, + &dev->sriov.alias_guid.ports_guid[port_index]. + alias_guid_work, + msecs_to_jiffies(resched_delay_sec * 1000)); + } + if (cb_ctx->sa_query) { + list_del(&cb_ctx->list); + kfree(cb_ctx); + } else + complete(&cb_ctx->done); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) +{ + int i; + u64 cur_admin_val; + ib_sa_comp_mask comp_mask = 0; + + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status + = MLX4_GUID_INFO_STATUS_SET; + + /* calculate the comp_mask for that record.*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + cur_admin_val = + *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].all_recs[GUID_REC_SIZE * i]; + /* + check the admin value: if it's for delete (~00LL) or + it is the first guid of the first record (hw guid) or + the records is not in ownership of the sysadmin and the sm doesn't + need to assign GUIDs, then don't put it up for assignment. + */ + if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || + (!index && !i)) + continue; + comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes |= comp_mask; + if (dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes) + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE; + +} + +static int set_guid_rec(struct ib_device *ibdev, + struct mlx4_next_alias_guid_work *rec) +{ + int err; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_guidinfo_rec guid_info_rec; + ib_sa_comp_mask comp_mask; + struct ib_port_attr attr; + struct mlx4_alias_guid_work_context *callback_context; + unsigned long resched_delay, flags, flags1; + u8 port = rec->port + 1; + int index = rec->block_num; + struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det; + struct list_head *head = + &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; + + memset(&attr, 0, sizeof(attr)); + err = __mlx4_ib_query_port(ibdev, port, &attr, 1); + if (err) { + pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n", + err, port); + return err; + } + /*check the port was configured by the sm, otherwise no need to send */ + if (attr.state != IB_PORT_ACTIVE) { + pr_debug("port %d not active...rescheduling\n", port); + resched_delay = 5 * HZ; + err = -EAGAIN; + goto new_schedule; + } + + callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL); + if (!callback_context) { + err = -ENOMEM; + resched_delay = HZ * 5; + goto new_schedule; + } + callback_context->port = port; + callback_context->dev = dev; + callback_context->block_num = index; + callback_context->guid_indexes = rec_det->guid_indexes; + callback_context->method = rec->method; + + memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); + + guid_info_rec.lid = ib_lid_be16(attr.lid); + guid_info_rec.block_num = index; + + memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, + GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC); + comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM | + rec_det->guid_indexes; + + init_completion(&callback_context->done); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_add_tail(&callback_context->list, head); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + + callback_context->query_id = + ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, + ibdev, port, &guid_info_rec, + comp_mask, rec->method, 1000, + GFP_KERNEL, aliasguid_query_handler, + callback_context, + &callback_context->sa_query); + if (callback_context->query_id < 0) { + pr_debug("ib_sa_guid_info_rec_query failed, query_id: " + "%d. will reschedule to the next 1 sec.\n", + callback_context->query_id); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_del(&callback_context->list); + kfree(callback_context); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + resched_delay = 1 * HZ; + err = -EAGAIN; + goto new_schedule; + } + err = 0; + goto out; + +new_schedule: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + invalidate_guid_record(dev, port, index); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + resched_delay); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + +out: + return err; +} + +static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port) +{ + int j, k, entry; + __be64 guid; + + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + entry = j * NUM_ALIAS_GUID_IN_REC + k; + /* no request for the 0 entry (hw guid) */ + if (!entry || entry > dev->dev->persist->num_vfs || + !mlx4_is_slave_active(dev->dev, entry)) + continue; + guid = mlx4_get_admin_guid(dev->dev, entry, port); + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[j].all_recs + [GUID_REC_SIZE * k] = guid; + pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n", + entry, + be64_to_cpu(guid), + port); + } + } +} +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) +{ + int i; + unsigned long flags, flags1; + + pr_debug("port %d\n", port); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + + if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags & + GUID_STATE_NEED_PORT_INIT) { + mlx4_ib_guid_port_init(dev, port); + dev->sriov.alias_guid.ports_guid[port - 1].state_flags &= + (~GUID_STATE_NEED_PORT_INIT); + } + for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) + invalidate_guid_record(dev, port, i); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { + /* + make sure no work waits in the queue, if the work is already + queued(not on the timer) the cancel will fail. That is not a problem + because we just want the work started. + */ + cancel_delayed_work(&dev->sriov.alias_guid. + ports_guid[port - 1].alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void set_required_record(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *next_rec, + int record_index) +{ + int i; + int lowset_time_entry = -1; + int lowest_time = 0; + ib_sa_comp_mask delete_guid_indexes = 0; + ib_sa_comp_mask set_guid_indexes = 0; + struct mlx4_sriov_alias_guid_info_rec_det *rec = + &dev->sriov.alias_guid.ports_guid[port]. + all_rec_per_port[record_index]; + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + if (!(rec->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + + if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + delete_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + else + set_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + + if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <= + lowest_time) { + lowset_time_entry = i; + lowest_time = rec->guids_retry_schedule[i]; + } + } + + memcpy(&next_rec->rec_det, rec, sizeof(*rec)); + next_rec->port = port; + next_rec->block_num = record_index; + + if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) { + next_rec->rec_det.guid_indexes = delete_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_DELETE; + } else { + next_rec->rec_det.guid_indexes = set_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_SET; + } +} + +/* return index of record that should be updated based on lowest + * rescheduled time + */ +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec) +{ + int record_index = -1; + u64 low_record_time = 0; + struct mlx4_sriov_alias_guid_info_rec_det rec; + int j; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + rec = dev->sriov.alias_guid.ports_guid[port]. + all_rec_per_port[j]; + if (rec.status == MLX4_GUID_INFO_STATUS_IDLE && + rec.guid_indexes) { + if (record_index == -1 || + rec.time_to_run < low_record_time) { + record_index = j; + low_record_time = rec.time_to_run; + } + } + } + if (resched_delay_sec) { + u64 curr_time = ktime_get_boot_ns(); + + *resched_delay_sec = (low_record_time < curr_time) ? 0 : + div_u64((low_record_time - curr_time), NSEC_PER_SEC); + } + + return record_index; +} + +/* The function returns the next record that was + * not configured (or failed to be configured) */ +static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *rec) +{ + unsigned long flags; + int record_index; + int ret = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + record_index = get_low_record_time_index(dev, port, NULL); + + if (record_index < 0) { + ret = -ENOENT; + goto out; + } + + set_required_record(dev, port, rec, record_index); +out: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return ret; +} + +static void alias_guid_work(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + int ret = 0; + struct mlx4_next_alias_guid_work *rec; + struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port = + container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det, + alias_guid_work); + struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent; + struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid, + struct mlx4_ib_sriov, + alias_guid); + struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov); + + rec = kzalloc(sizeof *rec, GFP_KERNEL); + if (!rec) + return; + + pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1); + ret = get_next_record_to_update(dev, sriov_alias_port->port, rec); + if (ret) { + pr_debug("No more records to update.\n"); + goto out; + } + + set_guid_rec(&dev->ib_dev, rec); +out: + kfree(rec); +} + + +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) +{ + unsigned long flags, flags1; + + if (!mlx4_is_master(dev->dev)) + return; + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + /* If there is pending one should cancel then run, otherwise + * won't run till previous one is ended as same work + * struct is used. + */ + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port]. + alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, + &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) +{ + int i; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct mlx4_alias_guid_work_context *cb_ctx; + struct mlx4_sriov_alias_guid_port_rec_det *det; + struct ib_sa_query *sa_query; + unsigned long flags; + + for (i = 0 ; i < dev->num_ports; i++) { + det = &sriov->alias_guid.ports_guid[i]; + cancel_delayed_work_sync(&det->alias_guid_work); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + while (!list_empty(&det->cb_list)) { + cb_ctx = list_entry(det->cb_list.next, + struct mlx4_alias_guid_work_context, + list); + sa_query = cb_ctx->sa_query; + cb_ctx->sa_query = NULL; + list_del(&cb_ctx->list); + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + ib_sa_cancel_query(cb_ctx->query_id, sa_query); + wait_for_completion(&cb_ctx->done); + kfree(cb_ctx); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + } + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + } + for (i = 0 ; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + } + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); +} + +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) +{ + char alias_wq_name[15]; + int ret = 0; + int i, j; + union ib_gid gid; + + if (!mlx4_is_master(dev->dev)) + return 0; + dev->sriov.alias_guid.sa_client = + kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL); + if (!dev->sriov.alias_guid.sa_client) + return -ENOMEM; + + ib_sa_register_client(dev->sriov.alias_guid.sa_client); + + spin_lock_init(&dev->sriov.alias_guid.ag_work_lock); + + for (i = 1; i <= dev->num_ports; ++i) { + if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) { + ret = -EFAULT; + goto err_unregister; + } + } + + for (i = 0 ; i < dev->num_ports; i++) { + memset(&dev->sriov.alias_guid.ports_guid[i], 0, + sizeof (struct mlx4_sriov_alias_guid_port_rec_det)); + dev->sriov.alias_guid.ports_guid[i].state_flags |= + GUID_STATE_NEED_PORT_INIT; + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + /* mark each val as it was deleted */ + memset(dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs, 0xFF, + sizeof(dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs)); + } + INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); + /*prepare the records, set them to be allocated by sm*/ + if (mlx4_ib_sm_guid_assign) + for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++) + mlx4_set_admin_guid(dev->dev, 0, j, i + 1); + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) + invalidate_guid_record(dev, i + 1, j); + + dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; + dev->sriov.alias_guid.ports_guid[i].port = i; + + snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); + dev->sriov.alias_guid.ports_guid[i].wq = + alloc_ordered_workqueue(alias_wq_name, WQ_MEM_RECLAIM); + if (!dev->sriov.alias_guid.ports_guid[i].wq) { + ret = -ENOMEM; + goto err_thread; + } + INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work, + alias_guid_work); + } + return 0; + +err_thread: + for (--i; i >= 0; i--) { + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + dev->sriov.alias_guid.ports_guid[i].wq = NULL; + } + +err_unregister: + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); + dev->sriov.alias_guid.sa_client = NULL; + pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret); + return ret; +} diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c new file mode 100644 index 000000000..d3e11503e --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> + +#include <linux/mlx4/cmd.h> +#include <linux/rbtree.h> +#include <linux/idr.h> +#include <rdma/ib_cm.h> + +#include "mlx4_ib.h" + +#define CM_CLEANUP_CACHE_TIMEOUT (30 * HZ) + +struct id_map_entry { + struct rb_node node; + + u32 sl_cm_id; + u32 pv_cm_id; + int slave_id; + int scheduled_delete; + struct mlx4_ib_dev *dev; + + struct list_head list; + struct delayed_work timeout; +}; + +struct cm_generic_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; +}; + +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + +struct cm_req_msg { + unsigned char unused[0x60]; + union ib_gid primary_path_sgid; +}; + + +static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_local_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); + } +} + +static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_remote_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); + } +} + +static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) +{ + struct cm_req_msg *msg = (struct cm_req_msg *)mad; + + return msg->primary_path_sgid; +} + +/* Lock should be taken before called */ +static struct id_map_entry * +id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node *node = sl_id_map->rb_node; + + while (node) { + struct id_map_entry *id_map_entry = + rb_entry(node, struct id_map_entry, node); + + if (id_map_entry->sl_cm_id > sl_cm_id) + node = node->rb_left; + else if (id_map_entry->sl_cm_id < sl_cm_id) + node = node->rb_right; + else if (id_map_entry->slave_id > slave_id) + node = node->rb_left; + else if (id_map_entry->slave_id < slave_id) + node = node->rb_right; + else + return id_map_entry; + } + return NULL; +} + +static void id_map_ent_timeout(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout); + struct id_map_entry *db_ent, *found_ent; + struct mlx4_ib_dev *dev = ent->dev; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + int pv_id = (int) ent->pv_cm_id; + + spin_lock(&sriov->id_map_lock); + db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); + if (!db_ent) + goto out; + found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_id); + +out: + list_del(&ent->list); + spin_unlock(&sriov->id_map_lock); + kfree(ent); +} + +static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct id_map_entry *ent, *found_ent; + + spin_lock(&sriov->id_map_lock); + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); + if (!ent) + goto out; + found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_cm_id); +out: + spin_unlock(&sriov->id_map_lock); +} + +static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node **link = &sl_id_map->rb_node, *parent = NULL; + struct id_map_entry *ent; + int slave_id = new->slave_id; + int sl_cm_id = new->sl_cm_id; + + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) { + pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n", + sl_cm_id); + + rb_replace_node(&ent->node, &new->node, sl_id_map); + return; + } + + /* Go to the bottom of the tree */ + while (*link) { + parent = *link; + ent = rb_entry(parent, struct id_map_entry, node); + + if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, sl_id_map); +} + +static struct id_map_entry * +id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) +{ + int ret; + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL); + if (!ent) + return ERR_PTR(-ENOMEM); + + ent->sl_cm_id = sl_cm_id; + ent->slave_id = slave_id; + ent->scheduled_delete = 0; + ent->dev = to_mdev(ibdev); + INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); + + idr_preload(GFP_KERNEL); + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); + + ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + ent->pv_cm_id = (u32)ret; + sl_id_map_add(ibdev, ent); + list_add_tail(&ent->list, &sriov->cm_list); + } + + spin_unlock(&sriov->id_map_lock); + idr_preload_end(); + + if (ret >= 0) + return ent; + + /*error flow*/ + kfree(ent); + mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); + return ERR_PTR(-ENOMEM); +} + +static struct id_map_entry * +id_map_get(struct ib_device *ibdev, int *pv_cm_id, int slave_id, int sl_cm_id) +{ + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + spin_lock(&sriov->id_map_lock); + if (*pv_cm_id == -1) { + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) + *pv_cm_id = (int) ent->pv_cm_id; + } else + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id); + spin_unlock(&sriov->id_map_lock); + + return ent; +} + +static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + unsigned long flags; + + spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); + /*make sure that there is no schedule inside the scheduled work.*/ + if (!sriov->is_going_down) { + id->scheduled_delete = 1; + schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } else if (id->scheduled_delete) { + /* Adjust timeout if already scheduled */ + mod_delayed_work(system_wq, &id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } + spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); +} + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad) +{ + struct id_map_entry *id; + u32 sl_cm_id; + int pv_cm_id = -1; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + if (id) + goto cont; + id = id_map_alloc(ibdev, slave_id, sl_cm_id); + if (IS_ERR(id)) { + mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", + __func__, slave_id, sl_cm_id); + return PTR_ERR(id); + } + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + return 0; + } else { + sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + } + + if (!id) { + pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", + slave_id, sl_cm_id); + return -EINVAL; + } + +cont: + set_local_comm_id(mad, id->pv_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) + id_map_find_del(ibdev, pv_cm_id); + + return 0; +} + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad) +{ + u32 pv_cm_id; + struct id_map_entry *id; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + union ib_gid gid; + + if (!slave) + return 0; + + gid = gid_from_req_msg(ibdev, mad); + *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); + if (*slave < 0) { + mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", + be64_to_cpu(gid.global.interface_id)); + return -ENOENT; + } + return 0; + } + + pv_cm_id = get_remote_comm_id(mad); + id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); + + if (!id) { + pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); + return -ENOENT; + } + + if (slave) + *slave = id->slave_id; + set_remote_comm_id(mad, id->sl_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { + id_map_find_del(ibdev, (int) pv_cm_id); + } + + return 0; +} + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) +{ + spin_lock_init(&dev->sriov.id_map_lock); + INIT_LIST_HEAD(&dev->sriov.cm_list); + dev->sriov.sl_id_map = RB_ROOT; + idr_init(&dev->sriov.pv_id_table); +} + +/* slave = -1 ==> all slaves */ +/* TBD -- call paravirt clean for single slave. Need for slave RESET event */ +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) +{ + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct list_head lh; + struct rb_node *nd; + int need_flush = 0; + struct id_map_entry *map, *tmp_map; + /* cancel all delayed work queue entries */ + INIT_LIST_HEAD(&lh); + spin_lock(&sriov->id_map_lock); + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave < 0 || slave == map->slave_id) { + if (map->scheduled_delete) + need_flush |= !cancel_delayed_work(&map->timeout); + } + } + + spin_unlock(&sriov->id_map_lock); + + if (need_flush) + flush_scheduled_work(); /* make sure all timers were flushed */ + + /* now, remove all leftover entries from databases*/ + spin_lock(&sriov->id_map_lock); + if (slave < 0) { + while (rb_first(sl_id_map)) { + struct id_map_entry *ent = + rb_entry(rb_first(sl_id_map), + struct id_map_entry, node); + + rb_erase(&ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); + } + list_splice_init(&dev->sriov.cm_list, &lh); + } else { + /* first, move nodes belonging to slave to db remove list */ + nd = rb_first(sl_id_map); + while (nd) { + struct id_map_entry *ent = + rb_entry(nd, struct id_map_entry, node); + nd = rb_next(nd); + if (ent->slave_id == slave) + list_move_tail(&ent->list, &lh); + } + /* remove those nodes from databases */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + rb_erase(&map->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); + } + + /* add remaining nodes from cm_list */ + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave == map->slave_id) + list_move_tail(&map->list, &lh); + } + } + + spin_unlock(&sriov->id_map_lock); + + /* free any map entries left behind due to cancel_delayed_work above */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + list_del(&map->list); + kfree(map); + } +} diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c new file mode 100644 index 000000000..82adc0d1d --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -0,0 +1,980 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mlx4/cq.h> +#include <linux/mlx4/qp.h> +#include <linux/mlx4/srq.h> +#include <linux/slab.h> + +#include "mlx4_ib.h" +#include <rdma/mlx4-abi.h> + +static void mlx4_ib_cq_comp(struct mlx4_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_cq *ibcq; + + if (type != MLX4_EVENT_TYPE_CQ_ERROR) { + pr_warn("Unexpected event type %d " + "on CQ %06x\n", type, cq->cqn); + return; + } + + ibcq = &to_mibcq(cq)->ibcq; + if (ibcq->event_handler) { + event.device = ibcq->device; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) +{ + return mlx4_buf_offset(&buf->buf, n * buf->entry_size); +} + +static void *get_cqe(struct mlx4_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n); +} + +static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); + + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx4_ib_cq *mcq = to_mcq(cq); + struct mlx4_ib_dev *dev = to_mdev(cq->device); + + return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period); +} + +static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent) +{ + int err; + + err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, + PAGE_SIZE * 2, &buf->buf); + + if (err) + goto out; + + buf->entry_size = dev->dev->caps.cqe_size; + err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, + &buf->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &buf->mtt); + +err_buf: + mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); + +out: + return err; +} + +static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) +{ + mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); +} + +static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, + struct mlx4_ib_cq_buf *buf, struct ib_umem **umem, + u64 buf_addr, int cqe) +{ + int err; + int cqe_size = dev->dev->caps.cqe_size; + int shift; + int n; + + *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(*umem)) + return PTR_ERR(*umem); + + n = ib_umem_page_count(*umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n); + err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); + + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &buf->mtt); + +err_buf: + ib_umem_release(*umem); + + return err; +} + +#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + int vector = attr->comp_vector; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_cq *cq; + struct mlx4_uar *uar; + int err; + + if (entries < 1 || entries > dev->dev->caps.max_cqes) + return ERR_PTR(-EINVAL); + + if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED) + return ERR_PTR(-EINVAL); + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + entries = roundup_pow_of_two(entries + 1); + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + cq->create_flags = attr->flags; + INIT_LIST_HEAD(&cq->send_qp_list); + INIT_LIST_HEAD(&cq->recv_qp_list); + + if (context) { + struct mlx4_ib_create_cq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_cq; + } + + err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem, + ucmd.buf_addr, entries); + if (err) + goto err_cq; + + err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_mtt; + + uar = &to_mucontext(context)->uar; + cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS; + } else { + err = mlx4_db_alloc(dev->dev, &cq->db, 1); + if (err) + goto err_cq; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries); + if (err) + goto err_db; + + uar = &dev->priv_uar; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; + } + + if (dev->eq_table) + vector = dev->eq_table[vector % ibdev->num_comp_vectors]; + + err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, + cq->db.dma, &cq->mcq, vector, 0, + !!(cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); + if (err) + goto err_dbmap; + + if (context) + cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp; + else + cq->mcq.comp = mlx4_ib_cq_comp; + cq->mcq.event = mlx4_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) { + err = -EFAULT; + goto err_cq_free; + } + + return &cq->ibcq; + +err_cq_free: + mlx4_cq_free(dev->dev, &cq->mcq); + +err_dbmap: + if (context) + mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); + + if (context) + ib_umem_release(cq->umem); + else + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + +err_db: + if (!context) + mlx4_db_free(dev->dev, &cq->db); + +err_cq: + kfree(cq); + + return ERR_PTR(err); +} + +static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries) +{ + int err; + + if (cq->resize_buf) + return -EBUSY; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries, struct ib_udata *udata) +{ + struct mlx4_ib_resize_cq ucmd; + int err; + + if (cq->resize_umem) + return -EBUSY; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return -EFAULT; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf, + &cq->resize_umem, ucmd.buf_addr, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) +{ + u32 i; + + i = cq->mcq.cons_index; + while (get_sw_cqe(cq, i)) + ++i; + + return i - cq->mcq.cons_index; +} + +static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) +{ + struct mlx4_cqe *cqe, *new_cqe; + int i; + int cqe_size = cq->buf.entry_size; + int cqe_inc = cqe_size == 64 ? 1 : 0; + + i = cq->mcq.cons_index; + cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { + new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, + (i + 1) & cq->resize_buf->cqe); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); + cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; + } + ++cq->mcq.cons_index; +} + +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibcq->device); + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_mtt mtt; + int outst_cqe; + int err; + + mutex_lock(&cq->resize_mutex); + if (entries < 1 || entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + + entries = roundup_pow_of_two(entries + 1); + if (entries == ibcq->cqe + 1) { + err = 0; + goto out; + } + + if (entries > dev->dev->caps.max_cqes + 1) { + err = -EINVAL; + goto out; + } + + if (ibcq->uobject) { + err = mlx4_alloc_resize_umem(dev, cq, entries, udata); + if (err) + goto out; + } else { + /* Can't be smaller than the number of outstanding CQEs */ + outst_cqe = mlx4_ib_get_outstanding_cqes(cq); + if (entries < outst_cqe + 1) { + err = -EINVAL; + goto out; + } + + err = mlx4_alloc_resize_buf(dev, cq, entries); + if (err) + goto out; + } + + mtt = cq->buf.mtt; + + err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt); + if (err) + goto err_buf; + + mlx4_mtt_cleanup(dev->dev, &mtt); + if (ibcq->uobject) { + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + ib_umem_release(cq->umem); + cq->umem = cq->resize_umem; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + } else { + struct mlx4_ib_cq_buf tmp_buf; + int tmp_cqe = 0; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf) { + mlx4_ib_cq_resize_copy_cqes(cq); + tmp_buf = cq->buf; + tmp_cqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + spin_unlock_irq(&cq->lock); + + if (tmp_cqe) + mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe); + } + + goto out; + +err_buf: + mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt); + if (!ibcq->uobject) + mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf, + cq->resize_buf->cqe); + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + + if (cq->resize_umem) { + ib_umem_release(cq->resize_umem); + cq->resize_umem = NULL; + } + +out: + mutex_unlock(&cq->resize_mutex); + + return err; +} + +int mlx4_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx4_ib_dev *dev = to_mdev(cq->device); + struct mlx4_ib_cq *mcq = to_mcq(cq); + + mlx4_cq_free(dev->dev, &mcq->mcq); + mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt); + + if (cq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); + ib_umem_release(mcq->umem); + } else { + mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); + mlx4_db_free(dev->dev, &mcq->db); + } + + kfree(mcq); + + return 0; +} + +static void dump_cqe(void *cqe) +{ + __be32 *buf = cqe; + + pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]), + be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]), + be32_to_cpu(buf[6]), be32_to_cpu(buf[7])); +} + +static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe, + struct ib_wc *wc) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) { + pr_debug("local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index), + cqe->vendor_err_syndrome, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + dump_cqe(cqe); + } + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_syndrome; +} + +static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) +{ + return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPV4F | + MLX4_CQE_STATUS_IPV4OPT | + MLX4_CQE_STATUS_IPV6 | + MLX4_CQE_STATUS_IPOK)) == + cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPOK)) && + (status & cpu_to_be16(MLX4_CQE_STATUS_UDP | + MLX4_CQE_STATUS_TCP)) && + checksum == cpu_to_be16(0xffff); +} + +static void use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, + unsigned tail, struct mlx4_cqe *cqe, int is_eth) +{ + struct mlx4_ib_proxy_sqp_hdr *hdr; + + ib_dma_sync_single_for_cpu(qp->ibqp.device, + qp->sqp_proxy_rcv[tail].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); + wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); + wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; + wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; + wc->dlid_path_bits = 0; + + if (is_eth) { + wc->slid = 0; + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } +} + +static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled, int is_send) +{ + struct mlx4_ib_wq *wq; + unsigned cur; + int i; + + wq = is_send ? &qp->sq : &qp->rq; + cur = wq->head - wq->tail; + + if (cur == 0) + return; + + for (i = 0; i < cur && *npolled < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + (*npolled)++; + wc->qp = &qp->ibqp; + wc++; + } +} + +static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx4_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and return + * simulated FLUSH_ERR completions + */ + list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1); + if (*npolled >= num_entries) + goto out; + } + + list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0); + if (*npolled >= num_entries) + goto out; + } + +out: + return; +} + +static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, + struct mlx4_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx4_cqe *cqe; + struct mlx4_qp *mqp; + struct mlx4_ib_wq *wq; + struct mlx4_ib_srq *srq; + struct mlx4_srq *msrq = NULL; + int is_send; + int is_error; + int is_eth; + u32 g_mlpath_rqpn; + u16 wqe_ctr; + unsigned tail = 0; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + if (cq->buf.entry_size == 64) + cqe++; + + ++cq->mcq.cons_index; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + /* Resize CQ in progress */ + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) { + if (cq->resize_buf) { + struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device); + + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + + goto repoll; + } + + if (!*cur_qp || + (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, + be32_to_cpu(cqe->vlan_my_qpn)); + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + + if (wc->qp->qp_type == IB_QPT_XRC_TGT) { + u32 srq_num; + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + srq_num = g_mlpath_rqpn & 0xffffff; + /* SRQ is also in the radix tree */ + msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + srq_num); + } + + if (is_send) { + wq = &(*cur_qp)->sq; + if (!(*cur_qp)->sq_signal_bits) { + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wq->tail += (u16) (wqe_ctr - (u16) wq->tail); + } + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else if (msrq) { + srq = to_mibsrq(msrq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + tail = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[tail]; + ++wq->tail; + } + + if (unlikely(is_error)) { + mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + return 0; + } + + wc->status = IB_WC_SUCCESS; + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX4_OPCODE_SEND: + case MLX4_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_LSO: + wc->opcode = IB_WC_LSO; + break; + case MLX4_OPCODE_FMR: + wc->opcode = IB_WC_REG_MR; + break; + case MLX4_OPCODE_LOCAL_INVAL: + wc->opcode = IB_WC_LOCAL_INV; + break; + } + } else { + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid); + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + } + + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); + if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { + if ((*cur_qp)->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + use_tunnel_data(*cur_qp, cq, wc, tail, cqe, + is_eth); + return 0; + } + } + + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; + wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, + cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; + if (is_eth) { + wc->slid = 0; + wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_CVLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(cqe->rlid); + wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->vlan_id = 0xffff; + } + } + + return 0; +} + +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device); + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } + + for (npolled = 0; npolled < num_entries; ++npolled) { + if (mlx4_ib_poll_one(cq, &cur_qp, wc + npolled)) + break; + } + + mlx4_cq_set_ci(&cq->mcq); + +out: + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + mlx4_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT, + to_mdev(ibcq->device)->uar_map, + MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock)); + + return 0; +} + +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + u32 prod_index; + int nfreed = 0; + struct mlx4_cqe *cqe, *dest; + u8 owner_bit; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe += cqe_inc; + + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx4_cq_set_ci(&cq->mcq); + } +} + +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + spin_lock_irq(&cq->lock); + __mlx4_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c new file mode 100644 index 000000000..c51740986 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/slab.h> + +#include "mlx4_ib.h" + +struct mlx4_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db) +{ + struct mlx4_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof *page, GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c new file mode 100644 index 000000000..418b9312f --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -0,0 +1,2399 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_cache.h> + +#include <linux/random.h> +#include <linux/mlx4/cmd.h> +#include <linux/gfp.h> +#include <rdma/ib_pma.h> +#include <linux/ip.h> +#include <net/ipv6.h> + +#include <linux/mlx4/driver.h> +#include "mlx4_ib.h" + +enum { + MLX4_IB_VENDOR_CLASS1 = 0x9, + MLX4_IB_VENDOR_CLASS2 = 0xa +}; + +#define MLX4_TUN_SEND_WRID_SHIFT 34 +#define MLX4_TUN_QPN_SHIFT 32 +#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) +#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) + +#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) +#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) + + /* Port mgmt change event handling */ + +#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) +#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) +#define NUM_IDX_IN_PKEY_TBL_BLK 32 +#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ +#define GUID_TBL_BLK_NUM_ENTRIES 8 +#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) + +struct mlx4_mad_rcv_buf { + struct ib_grh grh; + u8 payload[256]; +} __packed; + +struct mlx4_mad_snd_buf { + u8 payload[256]; +} __packed; + +struct mlx4_tunnel_mad { + struct ib_grh grh; + struct mlx4_ib_tunnel_header hdr; + struct ib_mad mad; +} __packed; + +struct mlx4_rcv_tunnel_mad { + struct mlx4_rcv_tunnel_hdr hdr; + struct ib_grh grh; + struct ib_mad mad; +} __packed; + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap); + +__be64 mlx4_ib_gen_node_guid(void) +{ +#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) + return cpu_to_be64(NODE_GUID_HI | prandom_u32()); +} + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) +{ + return cpu_to_be64(atomic_inc_return(&ctx->tid)) | + cpu_to_be64(0xff00000000000000LL); +} + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const void *in_mad, void *response_mad) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + + inmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) + op_modifier |= 0x1; + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) + op_modifier |= 0x2; + if (mlx4_is_mfunc(dev->dev) && + (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) + op_modifier |= 0x8; + + if (in_wc) { + struct { + __be32 my_qpn; + u32 reserved1; + __be32 rqpn; + u8 sl; + u8 g_path; + u16 reserved2[2]; + __be16 pkey; + u32 reserved3[11]; + u8 grh[40]; + } *ext_info; + + memset(inbox + 256, 0, 256); + ext_info = inbox + 256; + + ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); + ext_info->rqpn = cpu_to_be32(in_wc->src_qp); + ext_info->sl = in_wc->sl << 4; + ext_info->g_path = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); + ext_info->pkey = cpu_to_be16(in_wc->pkey_index); + + if (in_grh) + memcpy(ext_info->grh, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; + } + + err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, + mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + mlx4_free_cmd_mailbox(dev->dev, outmailbox); + + return err; +} + +static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct rdma_ah_attr ah_attr; + unsigned long flags; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.type = rdma_ah_find_type(&dev->ib_dev, port_num); + rdma_ah_set_dlid(&ah_attr, lid); + rdma_ah_set_sl(&ah_attr, sl); + rdma_ah_set_port_num(&ah_attr, port_num); + + new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (dev->sm_ah[port_num - 1]) + rdma_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock_irqrestore(&dev->sm_lock, flags); +} + +/* + * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can + * synthesize LID change, Client-Rereg, GID change, and P_Key change events. + */ +static void smp_snoop(struct ib_device *ibdev, u8 port_num, const struct ib_mad *mad, + u16 prev_lid) +{ + struct ib_port_info *pinfo; + u16 lid; + __be16 *base; + u32 bn, pkey_change_bitmap; + int i; + + + struct mlx4_ib_dev *dev = to_mdev(ibdev); + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) + switch (mad->mad_hdr.attr_id) { + case IB_SMP_ATTR_PORT_INFO: + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + return; + pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; + lid = be16_to_cpu(pinfo->lid); + + update_sm_ah(dev, port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + if (pinfo->clientrereg_resv_subnetto & 0x80) + handle_client_rereg_event(dev, port_num); + + if (prev_lid != lid) + handle_lid_change_event(dev, port_num); + break; + + case IB_SMP_ATTR_PKEY_TABLE: + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + return; + if (!mlx4_is_mfunc(dev->dev)) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + break; + } + + /* at this point, we are running in the master. + * Slaves do not receive SMPs. + */ + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; + base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); + pkey_change_bitmap = 0; + for (i = 0; i < 32; i++) { + pr_debug("PKEY[%d] = x%x\n", + i + bn*32, be16_to_cpu(base[i])); + if (be16_to_cpu(base[i]) != + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { + pkey_change_bitmap |= (1 << i); + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = + be16_to_cpu(base[i]); + } + } + pr_debug("PKEY Change event: port=%d, " + "block=0x%x, change_bitmap=0x%x\n", + port_num, bn, pkey_change_bitmap); + + if (pkey_change_bitmap) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + if (!dev->sriov.is_going_down) + __propagate_pkey_ev(dev, port_num, bn, + pkey_change_bitmap); + } + break; + + case IB_SMP_ATTR_GUID_INFO: + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + return; + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + if (mlx4_is_master(dev->dev) && + !dev->sriov.is_going_down) { + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); + mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + } + break; + + case IB_SMP_ATTR_SL_TO_VL_TABLE: + /* cache sl to vl mapping changes for use in + * filling QP1 LRH VL field when sending packets + */ + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV && + dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT) + return; + if (!mlx4_is_slave(dev->dev)) { + union sl2vl_tbl_to_u64 sl2vl64; + int jj; + + for (jj = 0; jj < 8; jj++) { + sl2vl64.sl8[jj] = ((struct ib_smp *)mad)->data[jj]; + pr_debug("port %u, sl2vl[%d] = %02x\n", + port_num, jj, sl2vl64.sl8[jj]); + } + atomic64_set(&dev->sl2vl[port_num - 1], sl2vl64.sl64); + } + break; + + default: + break; + } +} + +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap) +{ + int i, ix, slave, err; + int have_event = 0; + + for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { + if (slave == mlx4_master_func_num(dev->dev)) + continue; + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; + + have_event = 0; + for (i = 0; i < 32; i++) { + if (!(change_bitmap & (1 << i))) + continue; + for (ix = 0; + ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] + [ix] == i + 32 * block) { + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + pr_debug("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)\n", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + if (have_event) + break; + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + unsigned long flags; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, + IB_DEVICE_NODE_DESC_MAX); + spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); + } +} + +static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, const struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + unsigned long flags; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC, + IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). + */ + spin_lock_irqsave(&dev->sm_lock, flags); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock_irqrestore(&dev->sm_lock, flags); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int i; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (dev->sriov.demux[port - 1].guid_cache[i] == guid) + return i; + } + return -1; +} + + +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) +{ + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; + + if (slave == mlx4_master_func_num(dev->dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); + + unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; + + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; + + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; + + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; +} + +static int get_gids_from_l3_hdr(struct ib_grh *grh, union ib_gid *sgid, + union ib_gid *dgid) +{ + int version = ib_get_rdma_header_version((const union rdma_network_hdr *)grh); + enum rdma_network_type net_type; + + if (version == 4) + net_type = RDMA_NETWORK_IPV4; + else if (version == 6) + net_type = RDMA_NETWORK_IPV6; + else + return -EINVAL; + + return ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, + sgid, dgid); +} + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_ud_wr wr; + const struct ib_send_wr *bad_wr; + struct mlx4_ib_demux_pv_ctx *tun_ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_rcv_tunnel_mad *tun_mad; + struct rdma_ah_attr attr; + struct ib_ah *ah; + struct ib_qp *src_qp = NULL; + unsigned tun_tx_ix = 0; + int dqpn; + int ret = 0; + u16 tun_pkey_ix; + u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + + if (dest_qpt > IB_QPT_GSI) + return -EINVAL; + + tun_ctx = dev->sriov.demux[port-1].tun[slave]; + + /* check if proxy qp created */ + if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (!dest_qpt) + tun_qp = &tun_ctx->qp[0]; + else + tun_qp = &tun_ctx->qp[1]; + + /* compute P_Key index to put in tunnel header for slave */ + if (dest_qpt) { + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); + if (ret) + return -EINVAL; + + ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); + if (ret) + return -EINVAL; + tun_pkey_ix = pkey_ix; + } else + tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + + dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; + + /* get tunnel tx data buf for slave */ + src_qp = tun_qp->qp; + + /* create ah. Just need an empty one with the port num for the post send. + * The driver will set the force loopback bit in post_send */ + memset(&attr, 0, sizeof attr); + attr.type = rdma_ah_find_type(&dev->ib_dev, port); + + rdma_ah_set_port_num(&attr, port); + if (is_eth) { + union ib_gid sgid; + union ib_gid dgid; + + if (get_gids_from_l3_hdr(grh, &sgid, &dgid)) + return -EINVAL; + rdma_ah_set_grh(&attr, &dgid, 0, 0, 0, 0); + } + ah = rdma_create_ah(tun_ctx->pd, &attr); + if (IS_ERR(ah)) + return -ENOMEM; + + /* allocate tunnel tx buf after pass failure returns */ + spin_lock(&tun_qp->tx_lock); + if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&tun_qp->tx_lock); + if (ret) + goto end; + + tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); + if (tun_qp->tx_ring[tun_tx_ix].ah) + rdma_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); + tun_qp->tx_ring[tun_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + /* copy over to tunnel buffer */ + if (grh) + memcpy(&tun_mad->grh, grh, sizeof *grh); + memcpy(&tun_mad->mad, mad, sizeof *mad); + + /* adjust tunnel data */ + tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); + tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); + tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. + */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = ib_lid_be16(wc->slid); + } + + ib_dma_sync_single_for_device(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; + list.length = sizeof (struct mlx4_rcv_tunnel_mad); + list.lkey = tun_ctx->pd->local_dma_lkey; + + wr.ah = ah; + wr.port_num = port; + wr.remote_qkey = IB_QP_SET_QKEY; + wr.remote_qpn = dqpn; + wr.wr.next = NULL; + wr.wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); + wr.wr.sg_list = &list; + wr.wr.num_sge = 1; + wr.wr.opcode = IB_WR_SEND; + wr.wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(src_qp, &wr.wr, &bad_wr); + if (!ret) + return 0; + out: + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + tun_qp->tx_ring[tun_tx_ix].ah = NULL; +end: + rdma_destroy_ah(ah); + return ret; +} + +static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, + struct ib_wc *wc, struct ib_grh *grh, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err, other_port; + int slave = -1; + u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + union ib_gid dgid; + union ib_gid sgid; + + if (get_gids_from_l3_hdr(grh, &sgid, &dgid)) + return -EINVAL; + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + err = mlx4_get_slave_from_roce_gid(dev->dev, port, dgid.raw, &slave); + if (err && mlx4_is_mf_bonded(dev->dev)) { + other_port = (port == 1) ? 2 : 1; + err = mlx4_get_slave_from_roce_gid(dev->dev, other_port, dgid.raw, &slave); + if (!err) { + port = other_port; + pr_debug("resolved slave %d from gid %pI6 wire port %d other %d\n", + slave, grh->dgid.raw, port, other_port); + } + } + if (err) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } + + /* Initially assume that this mad is for us */ + slave = mlx4_master_func_num(dev->dev); + + /* See if the slave id is encoded in a response mad */ + if (mad->mad_hdr.method & 0x80) { + slave_id = (u8 *) &mad->mad_hdr.tid; + slave = *slave_id; + if (slave != 255) /*255 indicates the dom0*/ + *slave_id = 0; /* remap tid */ + } + + /* If a grh is present, we demux according to it */ + if (wc->wc_flags & IB_WC_GRH) { + if (grh->dgid.global.interface_id == + cpu_to_be64(IB_SA_WELL_KNOWN_GUID) && + grh->dgid.global.subnet_prefix == cpu_to_be64( + atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix))) { + slave = 0; + } else { + slave = mlx4_ib_find_real_gid(ibdev, port, + grh->dgid.global.interface_id); + if (slave < 0) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + } + } + /* Class-specific handling */ + switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_demux_sa_handler(ibdev, port, slave, + (struct ib_sa_mad *) mad)) + return 0; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad)) + return 0; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) + return 0; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + pr_debug("dropping unsupported ingress mad from class:%d " + "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); + return 0; + } + } + /*make sure that no slave==255 was not handled yet.*/ + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; +} + +static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid, prev_lid = 0; + int err; + struct ib_port_attr pattr; + + if (in_wc && in_wc->qp->qp_num) { + pr_debug("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->sgid.global.subnet_prefix), + be64_to_cpu(in_grh->sgid.global.interface_id)); + pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->dgid.global.subnet_prefix), + be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } + + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + + if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && + in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + !ib_query_port(ibdev, port_num, &pattr)) + prev_lid = ib_lid_cpu16(pattr.lid); + + err = mlx4_MAD_IFC(to_mdev(ibdev), + (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | + (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) | + MLX4_MAD_IFC_NET_VIEW, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + if (!out_mad->mad_hdr.status) { + smp_snoop(ibdev, port_num, in_mad, prev_lid); + /* slaves get node desc from FW */ + if (!mlx4_is_slave(to_mdev(ibdev)->dev)) + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void edit_counter(struct mlx4_counter *cnt, void *counters, + __be16 attr_id) +{ + switch (attr_id) { + case IB_PMA_PORT_COUNTERS: + { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)counters; + + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, + (be64_to_cpu(cnt->tx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, + (be64_to_cpu(cnt->rx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, + be64_to_cpu(cnt->tx_frames)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, + be64_to_cpu(cnt->rx_frames)); + break; + } + case IB_PMA_PORT_COUNTERS_EXT: + { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)counters; + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(be64_to_cpu(cnt->tx_bytes) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(be64_to_cpu(cnt->rx_bytes) >> 2); + pma_cnt_ext->port_xmit_packets = cnt->tx_frames; + pma_cnt_ext->port_rcv_packets = cnt->rx_frames; + break; + } + } +} + +static int iboe_process_mad_port_info(void *out_mad) +{ + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy(out_mad, &cpi, sizeof(cpi)); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx4_counter counter_stats; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct counter_index *tmp_counter; + int err = IB_MAD_RESULT_FAILURE, stats_avail = 0; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return -EINVAL; + + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) + return iboe_process_mad_port_info((void *)(out_mad->data + 40)); + + memset(&counter_stats, 0, sizeof(counter_stats)); + mutex_lock(&dev->counters_table[port_num - 1].mutex); + list_for_each_entry(tmp_counter, + &dev->counters_table[port_num - 1].counters_list, + list) { + err = mlx4_get_counter_stats(dev->dev, + tmp_counter->index, + &counter_stats, 0); + if (err) { + err = IB_MAD_RESULT_FAILURE; + stats_avail = 0; + break; + } + stats_avail = 1; + } + mutex_unlock(&dev->counters_table[port_num - 1].mutex); + if (stats_avail) { + memset(out_mad->data, 0, sizeof out_mad->data); + switch (counter_stats.counter_mode & 0xf) { + case 0: + edit_counter(&counter_stats, + (void *)(out_mad->data + 40), + in_mad->mad_hdr.attr_id); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + break; + default: + err = IB_MAD_RESULT_FAILURE; + } + } + + return err; +} + +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num); + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + /* iboe_process_mad() which uses the HCA flow-counters to implement IB PMA + * queries, should be called only by VFs and for that specific purpose + */ + if (link == IB_LINK_LAYER_INFINIBAND) { + if (mlx4_is_slave(dev->dev) && + (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS || + in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT || + in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO))) + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + + return ib_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + } + + if (link == IB_LINK_LAYER_ETHERNET) + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + + return -EINVAL; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + if (mad_send_wc->send_buf->context[0]) + rdma_destroy_ah(mad_send_wc->send_buf->context[0]); + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + enum rdma_link_layer ll; + + for (p = 0; p < dev->num_ports; ++p) { + ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); + for (q = 0; q <= 1; ++q) { + if (ll == IB_LINK_LAYER_INFINIBAND) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } else + dev->send_agent[p][q] = NULL; + } + } + + return 0; + +err: + for (p = 0; p < dev->num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + if (agent) { + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + } + + if (dev->sm_ah[p]) + rdma_destroy_ah(dev->sm_ah[p]); + } +} + +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); +} + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + /* re-configure the alias-guid and mcg's */ + if (mlx4_is_master(dev->dev)) { + mlx4_ib_invalidate_all_guid_record(dev, port_num); + + if (!dev->sriov.is_going_down) { + mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); + } + } + + /* Update the sl to vl table from inside client rereg + * only if in secure-host mode (snooping is not possible) + * and the sl-to-vl change event is not generated by FW. + */ + if (!mlx4_is_slave(dev->dev) && + dev->dev->flags & MLX4_FLAG_SECURE_HOST && + !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) { + if (mlx4_is_master(dev->dev)) + /* already in work queue from mlx4_ib_event queueing + * mlx4_handle_port_mgmt_change_event, which calls + * this procedure. Therefore, call sl2vl_update directly. + */ + mlx4_ib_sl2vl_update(dev, port_num); + else + mlx4_sched_ib_sl2vl_update_work(dev, port_num); + } + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); +} + +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + struct mlx4_eqe *eqe) +{ + __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), + GET_MASK_FROM_EQE(eqe)); +} + +static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, + u32 guid_tbl_blk_num, u32 change_bitmap) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + u16 i; + + if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) + return; + + in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + guid_tbl_blk_num *= 4; + + for (i = 0; i < 4; i++) { + if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) + continue; + memset(in_mad, 0, sizeof *in_mad); + memset(out_mad, 0, sizeof *out_mad); + + in_mad->base_version = 1; + in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + in_mad->class_version = 1; + in_mad->method = IB_MGMT_METHOD_GET; + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); + + if (mlx4_MAD_IFC(dev, + MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, + port_num, NULL, NULL, in_mad, out_mad)) { + mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); + goto out; + } + + mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + } + +out: + kfree(in_mad); + kfree(out_mad); + return; +} + +void handle_port_mgmt_change_event(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *dev = ew->ib_dev; + struct mlx4_eqe *eqe = &(ew->ib_eqe); + u8 port = eqe->event.port_mgmt_change.port; + u32 changed_attr; + u32 tbl_block; + u32 change_bitmap; + + switch (eqe->subtype) { + case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: + changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); + + /* Update the SM ah - This should be done before handling + the other changed attributes so that MADs can be sent to the SM */ + if (changed_attr & MSTR_SM_CHANGE_MASK) { + u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); + u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; + update_sm_ah(dev, port, lid, sl); + } + + /* Check if it is a lid change event */ + if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) + handle_lid_change_event(dev, port); + + /* Generate GUID changed event */ + if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { + if (mlx4_is_master(dev->dev)) { + union ib_gid gid; + int err = 0; + + if (!eqe->event.port_mgmt_change.params.port_info.gid_prefix) + err = __mlx4_ib_query_gid(&dev->ib_dev, port, 0, &gid, 1); + else + gid.global.subnet_prefix = + eqe->event.port_mgmt_change.params.port_info.gid_prefix; + if (err) { + pr_warn("Could not change QP1 subnet prefix for port %d: query_gid error (%d)\n", + port, err); + } else { + pr_debug("Changing QP1 subnet prefix for port %d. old=0x%llx. new=0x%llx\n", + port, + (u64)atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix), + be64_to_cpu(gid.global.subnet_prefix)); + atomic64_set(&dev->sriov.demux[port - 1].subnet_prefix, + be64_to_cpu(gid.global.subnet_prefix)); + } + } + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify all slaves*/ + if (mlx4_is_master(dev->dev)) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); + } + + if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) + handle_client_rereg_event(dev, port); + break; + + case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: + mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + propagate_pkey_ev(dev, port, eqe); + break; + case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + else if (!dev->sriov.is_going_down) { + tbl_block = GET_BLK_PTR_FROM_EQE(eqe); + change_bitmap = GET_MASK_FROM_EQE(eqe); + handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); + } + break; + + case MLX4_DEV_PMC_SUBTYPE_SL_TO_VL_MAP: + /* cache sl to vl mapping changes for use in + * filling QP1 LRH VL field when sending packets + */ + if (!mlx4_is_slave(dev->dev)) { + union sl2vl_tbl_to_u64 sl2vl64; + int jj; + + for (jj = 0; jj < 8; jj++) { + sl2vl64.sl8[jj] = + eqe->event.port_mgmt_change.params.sl2vl_tbl_change_info.sl2vl_table[jj]; + pr_debug("port %u, sl2vl[%d] = %02x\n", + port, jj, sl2vl64.sl8[jj]); + } + atomic64_set(&dev->sl2vl[port - 1], sl2vl64.sl64); + } + break; + default: + pr_warn("Unsupported subtype 0x%x for " + "Port Management Change event\n", eqe->subtype); + } + + kfree(ew); +} + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = type; + + ib_dispatch_event(&event); +} + +static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void mlx4_ib_wire_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wi_wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, + struct mlx4_ib_demux_pv_qp *tun_qp, + int index) +{ + struct ib_sge sg_list; + struct ib_recv_wr recv_wr; + const struct ib_recv_wr *bad_recv_wr; + int size; + + size = (tun_qp->qp->qp_type == IB_QPT_UD) ? + sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); + + sg_list.addr = tun_qp->ring[index].map; + sg_list.length = size; + sg_list.lkey = ctx->pd->local_dma_lkey; + + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | + MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); + ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, + size, DMA_FROM_DEVICE); + return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); +} + +static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) +{ + int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; + + return (qpn >= proxy_start && qpn <= proxy_start + 1); +} + + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct rdma_ah_attr *attr, + u8 *s_mac, u16 vlan_id, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_ud_wr wr; + const struct ib_send_wr *bad_wr; + struct mlx4_ib_demux_pv_ctx *sqp_ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct mlx4_mad_snd_buf *sqp_mad; + struct ib_ah *ah; + struct ib_qp *send_qp = NULL; + unsigned wire_tx_ix = 0; + int ret = 0; + u16 wire_pkey_ix; + int src_qpnum; + + sqp_ctx = dev->sriov.sqps[port-1]; + + /* check if proxy qp created */ + if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (dest_qpt == IB_QPT_SMI) { + src_qpnum = 0; + sqp = &sqp_ctx->qp[0]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + } else { + src_qpnum = 1; + sqp = &sqp_ctx->qp[1]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; + } + + send_qp = sqp->qp; + + /* create ah */ + ah = mlx4_ib_create_ah_slave(sqp_ctx->pd, attr, + rdma_ah_retrieve_grh(attr)->sgid_index, + s_mac, vlan_id); + if (IS_ERR(ah)) + return -ENOMEM; + spin_lock(&sqp->tx_lock); + if (sqp->tx_ix_head - sqp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&sqp->tx_lock); + if (ret) + goto out; + + sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); + if (sqp->tx_ring[wire_tx_ix].ah) + rdma_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); + sqp->tx_ring[wire_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + memcpy(&sqp_mad->payload, mad, sizeof *mad); + + ib_dma_sync_single_for_device(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + list.addr = sqp->tx_ring[wire_tx_ix].buf.map; + list.length = sizeof (struct mlx4_mad_snd_buf); + list.lkey = sqp_ctx->pd->local_dma_lkey; + + wr.ah = ah; + wr.port_num = port; + wr.pkey_index = wire_pkey_ix; + wr.remote_qkey = qkey; + wr.remote_qpn = remote_qpn; + wr.wr.next = NULL; + wr.wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); + wr.wr.sg_list = &list; + wr.wr.num_sge = 1; + wr.wr.opcode = IB_WR_SEND; + wr.wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(send_qp, &wr.wr, &bad_wr); + if (!ret) + return 0; + + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + sqp->tx_ring[wire_tx_ix].ah = NULL; +out: + mlx4_ib_destroy_ah(ah); + return ret; +} + +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct rdma_ah_attr *ah_attr) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr); + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + grh->sgid_index = slave; + else + grh->sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + +static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) +{ + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; + int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); + struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; + struct mlx4_ib_ah ah; + struct rdma_ah_attr ah_attr; + u8 *slave_id; + int slave; + int port; + u16 vlan_id; + u8 qos; + u8 *dmac; + + /* Get slave that sent this packet */ + if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || + wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || + (wc->src_qp & 0x1) != ctx->port - 1 || + wc->src_qp & 0x4) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); + return; + } + slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; + if (slave != ctx->slave) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "belongs to another slave\n", wc->src_qp); + return; + } + + /* Map transaction ID */ + ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, + sizeof (struct mlx4_tunnel_mad), + DMA_FROM_DEVICE); + switch (tunnel->mad.mad_hdr.method) { + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_REPORT: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_DELETE: + case IB_SA_METHOD_GET_MULTI: + case IB_SA_METHOD_GET_TRACE_TBL: + slave_id = (u8 *) &tunnel->mad.mad_hdr.tid; + if (*slave_id) { + mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " + "class:%d slave:%d\n", *slave_id, + tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } else + *slave_id = slave; + default: + /* nothing */; + } + + /* Class-specific handling */ + switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_sa_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET && + tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET) + return; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " + "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } + } + + /* We are using standard ib_core services to send the mad, so generate a + * stadard address handle by decoding the tunnelled mlx4_ah fields */ + memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); + ah.ibah.device = ctx->ib_dev; + + port = be32_to_cpu(ah.av.ib.port_pd) >> 24; + port = mlx4_slave_convert_port(dev->dev, slave, port); + if (port < 0) + return; + ah.av.ib.port_pd = cpu_to_be32(port << 24 | (be32_to_cpu(ah.av.ib.port_pd) & 0xffffff)); + ah.ibah.type = rdma_ah_find_type(&dev->ib_dev, port); + + mlx4_ib_query_ah(&ah.ibah, &ah_attr); + if (rdma_ah_get_ah_flags(&ah_attr) & IB_AH_GRH) + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + dmac = rdma_ah_retrieve_dmac(&ah_attr); + if (dmac) + memcpy(dmac, tunnel->hdr.mac, ETH_ALEN); + vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + if (mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &vlan_id, &qos)) + rdma_ah_set_sl(&ah_attr, qos); + + mlx4_ib_send_to_wire(dev, slave, ctx->port, + is_proxy_qp0(dev, wc->src_qp, slave) ? + IB_QPT_SMI : IB_QPT_GSI, + be16_to_cpu(tunnel->hdr.pkey_index), + be32_to_cpu(tunnel->hdr.remote_qpn), + be32_to_cpu(tunnel->hdr.qkey), + &ah_attr, wc->smac, vlan_id, &tunnel->mad); +} + +static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + tun_qp->ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, + sizeof(struct mlx4_ib_buf), + GFP_KERNEL); + if (!tun_qp->ring) + return -ENOMEM; + + tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, + sizeof (struct mlx4_ib_tun_tx_buf), + GFP_KERNEL); + if (!tun_qp->tx_ring) { + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; + } + + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); + if (!tun_qp->ring[i].addr) + goto err; + tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, + tun_qp->ring[i].addr, + rx_buf_size, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) { + kfree(tun_qp->ring[i].addr); + goto err; + } + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->tx_ring[i].buf.addr = + kmalloc(tx_buf_size, GFP_KERNEL); + if (!tun_qp->tx_ring[i].buf.addr) + goto tx_err; + tun_qp->tx_ring[i].buf.map = + ib_dma_map_single(ctx->ib_dev, + tun_qp->tx_ring[i].buf.addr, + tx_buf_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, + tun_qp->tx_ring[i].buf.map)) { + kfree(tun_qp->tx_ring[i].buf.addr); + goto tx_err; + } + tun_qp->tx_ring[i].ah = NULL; + } + spin_lock_init(&tun_qp->tx_lock); + tun_qp->tx_ix_head = 0; + tun_qp->tx_ix_tail = 0; + tun_qp->proxy_qpt = qp_type; + + return 0; + +tx_err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + } + i = MLX4_NUM_TUNNEL_BUFS; +err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; +} + +static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return; + + tun_qp = &ctx->qp[qp_type]; + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + if (tun_qp->tx_ring[i].ah) + rdma_destroy_ah(tun_qp->tx_ring[i].ah); + } + kfree(tun_qp->tx_ring); + kfree(tun_qp->ring); +} + +static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct ib_wc wc; + int ret; + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { + tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_RECV: + mlx4_ib_multiplex_mad(ctx, &wc); + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, + wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)); + if (ret) + pr_err("Failed reposting tunnel " + "buf:%lld\n", wc.wr_id); + break; + case IB_WC_SEND: + pr_debug("received tunnel send completion:" + "wrid=0x%llx, status=0x%x\n", + wc.wr_id, wc.status); + rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + + break; + default: + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + } + } + } +} + +static void pv_qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct mlx4_ib_demux_pv_ctx *sqp = qp_context; + + /* It's worse than that! He's dead, Jim! */ + pr_err("Fatal error (%d) on a MAD QP on port %d\n", + event->event, sqp->port); +} + +static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int create_tun) +{ + int i, ret; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; + struct ib_qp_attr attr; + int qp_attr_mask_INIT; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.init_attr.send_cq = ctx->cq; + qp_init_attr.init_attr.recv_cq = ctx->cq; + qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_send_sge = 1; + qp_init_attr.init_attr.cap.max_recv_sge = 1; + if (create_tun) { + qp_init_attr.init_attr.qp_type = IB_QPT_UD; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP; + qp_init_attr.port = ctx->port; + qp_init_attr.slave = ctx->slave; + qp_init_attr.proxy_qp_type = qp_type; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_QKEY | IB_QP_PORT; + } else { + qp_init_attr.init_attr.qp_type = qp_type; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; + } + qp_init_attr.init_attr.port_num = ctx->port; + qp_init_attr.init_attr.qp_context = ctx; + qp_init_attr.init_attr.event_handler = pv_qp_event_handler; + tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); + if (IS_ERR(tun_qp->qp)) { + ret = PTR_ERR(tun_qp->qp); + tun_qp->qp = NULL; + pr_err("Couldn't create %s QP (%d)\n", + create_tun ? "tunnel" : "special", ret); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IB_QPS_INIT; + ret = 0; + if (create_tun) + ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, + ctx->port, IB_DEFAULT_PKEY_FULL, + &attr.pkey_index); + if (ret || !create_tun) + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.qkey = IB_QP1_QKEY; + attr.port_num = ctx->port; + ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); + if (ret) { + pr_err("Couldn't change %s qp state to INIT (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); + if (ret) { + pr_err("Couldn't change %s qp state to RTR (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + pr_err("Couldn't change %s qp state to RTS (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); + if (ret) { + pr_err(" mlx4_ib_post_pv_buf error" + " (err = %d, i = %d)\n", ret, i); + goto err_qp; + } + } + return 0; + +err_qp: + ib_destroy_qp(tun_qp->qp); + tun_qp->qp = NULL; + return ret; +} + +/* + * IB MAD completion callback for real SQPs + */ +static void mlx4_ib_sqp_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct ib_wc wc; + struct ib_grh *grh; + struct ib_mad *mad; + + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { + sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + rdma_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + break; + case IB_WC_RECV: + mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); + grh = &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); + mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); + if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1))) + pr_err("Failed reposting SQP " + "buf:%lld\n", wc.wr_id); + break; + default: + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + rdma_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + } + } + } +} + +static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx **ret_ctx) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + + *ret_ctx = NULL; + ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ib_dev = &dev->ib_dev; + ctx->port = port; + ctx->slave = slave; + *ret_ctx = ctx; + return 0; +} + +static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (dev->sriov.demux[port - 1].tun[slave]) { + kfree(dev->sriov.demux[port - 1].tun[slave]); + dev->sriov.demux[port - 1].tun[slave] = NULL; + } +} + +static int create_pv_resources(struct ib_device *ibdev, int slave, int port, + int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) +{ + int ret, cq_size; + struct ib_cq_init_attr cq_attr = {}; + + if (ctx->state != DEMUX_PV_STATE_DOWN) + return -EEXIST; + + ctx->state = DEMUX_PV_STATE_STARTING; + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) + ctx->has_smi = 1; + + if (ctx->has_smi) { + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); + goto err_out; + } + } + + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); + goto err_out_qp0; + } + + cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; + if (ctx->has_smi) + cq_size *= 2; + + cq_attr.cqe = cq_size; + ctx->cq = ib_create_cq(ctx->ib_dev, + create_tun ? mlx4_ib_tunnel_comp_handler : mlx4_ib_wire_comp_handler, + NULL, ctx, &cq_attr); + if (IS_ERR(ctx->cq)) { + ret = PTR_ERR(ctx->cq); + pr_err("Couldn't create tunnel CQ (%d)\n", ret); + goto err_buf; + } + + ctx->pd = ib_alloc_pd(ctx->ib_dev, 0); + if (IS_ERR(ctx->pd)) { + ret = PTR_ERR(ctx->pd); + pr_err("Couldn't create tunnel PD (%d)\n", ret); + goto err_cq; + } + + if (ctx->has_smi) { + ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP0 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_pd; + } + } + + ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP1 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_qp0; + } + + if (create_tun) + INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); + else + INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); + + ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; + ctx->wi_wq = to_mdev(ibdev)->sriov.demux[port - 1].wi_wq; + + ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + if (ret) { + pr_err("Couldn't arm tunnel cq (%d)\n", ret); + goto err_wq; + } + ctx->state = DEMUX_PV_STATE_ACTIVE; + return 0; + +err_wq: + ctx->wq = NULL; + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + + +err_qp0: + if (ctx->has_smi) + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + +err_pd: + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + +err_cq: + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + +err_buf: + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); + +err_out_qp0: + if (ctx->has_smi) + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); +err_out: + ctx->state = DEMUX_PV_STATE_DOWN; + return ret; +} + +static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx *ctx, int flush) +{ + if (!ctx) + return; + if (ctx->state > DEMUX_PV_STATE_DOWN) { + ctx->state = DEMUX_PV_STATE_DOWNING; + if (flush) + flush_workqueue(ctx->wq); + if (ctx->has_smi) { + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); + } + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, + int port, int do_init) +{ + int ret = 0; + + if (!do_init) { + clean_vf_mcast(&dev->sriov.demux[port - 1], slave); + /* for master, destroy real sqp resources */ + if (slave == mlx4_master_func_num(dev->dev)) + destroy_pv_resources(dev, slave, port, + dev->sriov.sqps[port - 1], 1); + /* destroy the tunnel qp resources */ + destroy_pv_resources(dev, slave, port, + dev->sriov.demux[port - 1].tun[slave], 1); + return 0; + } + + /* create the tunnel qp resources */ + ret = create_pv_resources(&dev->ib_dev, slave, port, 1, + dev->sriov.demux[port - 1].tun[slave]); + + /* for master, create the real sqp resources */ + if (!ret && slave == mlx4_master_func_num(dev->dev)) + ret = create_pv_resources(&dev->ib_dev, slave, port, 0, + dev->sriov.sqps[port - 1]); + return ret; +} + +void mlx4_ib_tunnels_update_work(struct work_struct *work) +{ + struct mlx4_ib_demux_work *dmxw; + + dmxw = container_of(work, struct mlx4_ib_demux_work, work); + mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, + dmxw->do_init); + kfree(dmxw); + return; +} + +static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, + struct mlx4_ib_demux_ctx *ctx, + int port) +{ + char name[12]; + int ret = 0; + int i; + + ctx->tun = kcalloc(dev->dev->caps.sqp_demux, + sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); + if (!ctx->tun) + return -ENOMEM; + + ctx->dev = dev; + ctx->port = port; + ctx->ib_dev = &dev->ib_dev; + + for (i = 0; + i < min(dev->dev->caps.sqp_demux, + (u16)(dev->dev->persist->num_vfs + 1)); + i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev->dev, i); + + if (!test_bit(port - 1, actv_ports.ports)) + continue; + + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); + if (ret) { + ret = -ENOMEM; + goto err_mcg; + } + } + + ret = mlx4_ib_mcg_port_init(ctx); + if (ret) { + pr_err("Failed initializing mcg para-virt (%d)\n", ret); + goto err_mcg; + } + + snprintf(name, sizeof(name), "mlx4_ibt%d", port); + ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!ctx->wq) { + pr_err("Failed to create tunnelling WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wq; + } + + snprintf(name, sizeof(name), "mlx4_ibwi%d", port); + ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!ctx->wi_wq) { + pr_err("Failed to create wire WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wiwq; + } + + snprintf(name, sizeof(name), "mlx4_ibud%d", port); + ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!ctx->ud_wq) { + pr_err("Failed to create up/down WQ for port %d\n", port); + ret = -ENOMEM; + goto err_udwq; + } + + return 0; + +err_udwq: + destroy_workqueue(ctx->wi_wq); + ctx->wi_wq = NULL; + +err_wiwq: + destroy_workqueue(ctx->wq); + ctx->wq = NULL; + +err_wq: + mlx4_ib_mcg_port_cleanup(ctx, 1); +err_mcg: + for (i = 0; i < dev->dev->caps.sqp_demux; i++) + free_pv_object(dev, i, port); + kfree(ctx->tun); + ctx->tun = NULL; + return ret; +} + +static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) +{ + if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { + sqp_ctx->state = DEMUX_PV_STATE_DOWNING; + flush_workqueue(sqp_ctx->wq); + if (sqp_ctx->has_smi) { + ib_destroy_qp(sqp_ctx->qp[0].qp); + sqp_ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); + } + ib_destroy_qp(sqp_ctx->qp[1].qp); + sqp_ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); + ib_dealloc_pd(sqp_ctx->pd); + sqp_ctx->pd = NULL; + ib_destroy_cq(sqp_ctx->cq); + sqp_ctx->cq = NULL; + sqp_ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) +{ + int i; + if (ctx) { + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + mlx4_ib_mcg_port_cleanup(ctx, 1); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (!ctx->tun[i]) + continue; + if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) + ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; + } + flush_workqueue(ctx->wq); + flush_workqueue(ctx->wi_wq); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); + free_pv_object(dev, i, ctx->port); + } + kfree(ctx->tun); + destroy_workqueue(ctx->ud_wq); + destroy_workqueue(ctx->wi_wq); + destroy_workqueue(ctx->wq); + } +} + +static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) +{ + int i; + + if (!mlx4_is_master(dev->dev)) + return; + /* initialize or tear down tunnel QPs for the master */ + for (i = 0; i < dev->dev->caps.num_ports; i++) + mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); + return; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) +{ + int i = 0; + int err; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + dev->sriov.is_going_down = 0; + spin_lock_init(&dev->sriov.going_down_lock); + mlx4_ib_cm_paravirt_init(dev); + + mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n"); + + if (mlx4_is_slave(dev->dev)) { + mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); + return 0; + } + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (i == mlx4_master_func_num(dev->dev)) + mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid); + else + mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid()); + } + + err = mlx4_ib_init_alias_guid_service(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); + goto paravirt_err; + } + err = mlx4_ib_device_register_sysfs(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); + goto sysfs_err; + } + + mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", + dev->dev->caps.sqp_demux); + for (i = 0; i < dev->num_ports; i++) { + union ib_gid gid; + err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1); + if (err) + goto demux_err; + dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id; + atomic64_set(&dev->sriov.demux[i].subnet_prefix, + be64_to_cpu(gid.global.subnet_prefix)); + err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1, + &dev->sriov.sqps[i]); + if (err) + goto demux_err; + err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); + if (err) + goto free_pv; + } + mlx4_ib_master_tunnels(dev, 1); + return 0; + +free_pv: + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); +demux_err: + while (--i >= 0) { + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + mlx4_ib_device_unregister_sysfs(dev); + +sysfs_err: + mlx4_ib_destroy_alias_guid_service(dev); + +paravirt_err: + mlx4_ib_cm_paravirt_clean(dev, -1); + + return err; +} + +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) +{ + int i; + unsigned long flags; + + if (!mlx4_is_mfunc(dev->dev)) + return; + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + dev->sriov.is_going_down = 1; + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + if (mlx4_is_master(dev->dev)) { + for (i = 0; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.demux[i].ud_wq); + mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); + kfree(dev->sriov.sqps[i]); + dev->sriov.sqps[i] = NULL; + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + + mlx4_ib_cm_paravirt_clean(dev, -1); + mlx4_ib_destroy_alias_guid_service(dev); + mlx4_ib_device_unregister_sysfs(dev); + } +} diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c new file mode 100644 index 000000000..10d7aa87b --- /dev/null +++ b/drivers/infiniband/hw/mlx4/main.c @@ -0,0 +1,3450 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/rtnetlink.h> +#include <linux/if_vlan.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> + +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/devlink.h> + +#include <rdma/ib_smi.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> + +#include <net/bonding.h> + +#include <linux/mlx4/driver.h> +#include <linux/mlx4/cmd.h> +#include <linux/mlx4/qp.h> + +#include "mlx4_ib.h" +#include <rdma/mlx4-abi.h> + +#define DRV_NAME MLX4_IB_DRV_NAME +#define DRV_VERSION "4.0-0" + +#define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF +#define MLX4_IB_CARD_REV_A0 0xA0 + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +int mlx4_ib_sm_guid_assign = 0; +module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)"); + +static const char mlx4_ib_version[] = + DRV_NAME ": Mellanox ConnectX InfiniBand driver v" + DRV_VERSION "\n"; + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); +static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, + u8 port_num); + +static struct workqueue_struct *wq; + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static int check_flow_steering_support(struct mlx4_dev *dev) +{ + int eth_num_ports = 0; + int ib_num_ports = 0; + + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); + dmfs = 0; + } + } + return dmfs; +} + +static int num_ib_ports(struct mlx4_dev *dev) +{ + int ib_ports = 0; + int i; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + return ib_ports; +} + +static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num) +{ + struct mlx4_ib_dev *ibdev = to_mdev(device); + struct net_device *dev; + + rcu_read_lock(); + dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num); + + if (dev) { + if (mlx4_is_bonded(ibdev->dev)) { + struct net_device *upper = NULL; + + upper = netdev_master_upper_dev_get_rcu(dev); + if (upper) { + struct net_device *active; + + active = bond_option_active_slave_get_rcu(netdev_priv(upper)); + if (active) + dev = active; + } + } + } + if (dev) + dev_hold(dev); + + rcu_read_unlock(); + return dev; +} + +static int mlx4_ib_update_gids_v1(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct mlx4_dev *dev = ibdev->dev; + int i; + union ib_gid *gid_tbl; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + gid_tbl = mailbox->buf; + + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) + memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid)); + + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | port_num, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (mlx4_is_bonded(dev)) + err += mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | 2, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct mlx4_dev *dev = ibdev->dev; + int i; + struct { + union ib_gid gid; + __be32 rsrvd1[2]; + __be16 rsrvd2; + u8 type; + u8 version; + __be32 rsrvd3; + } *gid_tbl; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + gid_tbl = mailbox->buf; + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { + memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid)); + if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + gid_tbl[i].version = 2; + if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid)) + gid_tbl[i].type = 1; + } + } + + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | port_num, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (mlx4_is_bonded(dev)) + err += mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | 2, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_ib_update_gids(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num); + + return mlx4_ib_update_gids_v1(gids, ibdev, port_num); +} + +static void free_gid_entry(struct gid_entry *entry) +{ + memset(&entry->gid, 0, sizeof(entry->gid)); + kfree(entry->ctx); + entry->ctx = NULL; +} + +static int mlx4_ib_add_gid(const struct ib_gid_attr *attr, void **context) +{ + struct mlx4_ib_dev *ibdev = to_mdev(attr->device); + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct mlx4_port_gid_table *port_gid_table; + int free = -1, found = -1; + int ret = 0; + int hw_update = 0; + int i; + struct gid_entry *gids = NULL; + + if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) + return -EINVAL; + + if (attr->port_num > MLX4_MAX_PORTS) + return -EINVAL; + + if (!context) + return -EINVAL; + + port_gid_table = &iboe->gids[attr->port_num - 1]; + spin_lock_bh(&iboe->lock); + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { + if (!memcmp(&port_gid_table->gids[i].gid, + &attr->gid, sizeof(attr->gid)) && + port_gid_table->gids[i].gid_type == attr->gid_type) { + found = i; + break; + } + if (free < 0 && rdma_is_zero_gid(&port_gid_table->gids[i].gid)) + free = i; /* HW has space */ + } + + if (found < 0) { + if (free < 0) { + ret = -ENOSPC; + } else { + port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC); + if (!port_gid_table->gids[free].ctx) { + ret = -ENOMEM; + } else { + *context = port_gid_table->gids[free].ctx; + memcpy(&port_gid_table->gids[free].gid, + &attr->gid, sizeof(attr->gid)); + port_gid_table->gids[free].gid_type = attr->gid_type; + port_gid_table->gids[free].ctx->real_index = free; + port_gid_table->gids[free].ctx->refcount = 1; + hw_update = 1; + } + } + } else { + struct gid_cache_context *ctx = port_gid_table->gids[found].ctx; + *context = ctx; + ctx->refcount++; + } + if (!ret && hw_update) { + gids = kmalloc_array(MLX4_MAX_PORT_GIDS, sizeof(*gids), + GFP_ATOMIC); + if (!gids) { + ret = -ENOMEM; + *context = NULL; + free_gid_entry(&port_gid_table->gids[free]); + } else { + for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { + memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid)); + gids[i].gid_type = port_gid_table->gids[i].gid_type; + } + } + } + spin_unlock_bh(&iboe->lock); + + if (!ret && hw_update) { + ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); + if (ret) { + spin_lock_bh(&iboe->lock); + *context = NULL; + free_gid_entry(&port_gid_table->gids[free]); + spin_unlock_bh(&iboe->lock); + } + kfree(gids); + } + + return ret; +} + +static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) +{ + struct gid_cache_context *ctx = *context; + struct mlx4_ib_dev *ibdev = to_mdev(attr->device); + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct mlx4_port_gid_table *port_gid_table; + int ret = 0; + int hw_update = 0; + struct gid_entry *gids = NULL; + + if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) + return -EINVAL; + + if (attr->port_num > MLX4_MAX_PORTS) + return -EINVAL; + + port_gid_table = &iboe->gids[attr->port_num - 1]; + spin_lock_bh(&iboe->lock); + if (ctx) { + ctx->refcount--; + if (!ctx->refcount) { + unsigned int real_index = ctx->real_index; + + free_gid_entry(&port_gid_table->gids[real_index]); + hw_update = 1; + } + } + if (!ret && hw_update) { + int i; + + gids = kmalloc_array(MLX4_MAX_PORT_GIDS, sizeof(*gids), + GFP_ATOMIC); + if (!gids) { + ret = -ENOMEM; + } else { + for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { + memcpy(&gids[i].gid, + &port_gid_table->gids[i].gid, + sizeof(union ib_gid)); + gids[i].gid_type = + port_gid_table->gids[i].gid_type; + } + } + } + spin_unlock_bh(&iboe->lock); + + if (!ret && hw_update) { + ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); + kfree(gids); + } + return ret; +} + +int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, + const struct ib_gid_attr *attr) +{ + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct gid_cache_context *ctx = NULL; + struct mlx4_port_gid_table *port_gid_table; + int real_index = -EINVAL; + int i; + unsigned long flags; + u8 port_num = attr->port_num; + + if (port_num > MLX4_MAX_PORTS) + return -EINVAL; + + if (mlx4_is_bonded(ibdev->dev)) + port_num = 1; + + if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num)) + return attr->index; + + spin_lock_irqsave(&iboe->lock, flags); + port_gid_table = &iboe->gids[port_num - 1]; + + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) + if (!memcmp(&port_gid_table->gids[i].gid, + &attr->gid, sizeof(attr->gid)) && + attr->gid_type == port_gid_table->gids[i].gid_type) { + ctx = port_gid_table->gids[i].ctx; + break; + } + if (ctx) + real_index = ctx->real_index; + spin_unlock_irqrestore(&iboe->lock, flags); + return real_index; +} + +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) + +static int mlx4_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err; + int have_ib_ports; + struct mlx4_uverbs_ex_query_device cmd; + struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0}; + struct mlx4_clock_params clock_params; + + if (uhw->inlen) { + if (uhw->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, uhw, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + } + + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + err = -ENOMEM; + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof *props); + + have_ib_ports = num_ib_ports(dev->dev); + + props->fw_ver = dev->dev->caps.fw_ver; + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) + props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + if (dev->dev->caps.max_gso_sz && + (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) && + (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)) + props->device_cap_flags |= IB_DEVICE_UD_TSO; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; + else + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; + } + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = dev->dev->persist->pdev->device; + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = dev->dev->caps.page_size_cap; + props->max_qp = dev->dev->quotas.qp; + props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; + props->max_send_sge = + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); + props->max_recv_sge = + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); + props->max_sge_rd = MLX4_MAX_SGE_RD; + props->max_cq = dev->dev->quotas.cq; + props->max_cqe = dev->dev->caps.max_cqes; + props->max_mr = dev->dev->quotas.mpt; + props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; + props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; + props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = dev->dev->quotas.srq; + props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; + props->max_srq_sge = dev->dev->caps.max_srq_sge; + props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; + props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; + props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_pkeys = dev->dev->caps.pkey_table_len[1]; + props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; + props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; + props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL; + props->timestamp_mask = 0xFFFFFFFFFFFFULL; + props->max_ah = INT_MAX; + + if (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET || + mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET) { + if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { + props->rss_caps.max_rwq_indirection_tables = + props->max_qp; + props->rss_caps.max_rwq_indirection_table_size = + dev->dev->caps.max_rss_tbl_sz; + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = props->max_qp; + } + + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; + } + + props->cq_caps.max_cq_moderation_count = MLX4_MAX_CQ_COUNT; + props->cq_caps.max_cq_moderation_period = MLX4_MAX_CQ_PERIOD; + + if (uhw->outlen >= resp.response_length + sizeof(resp.hca_core_clock_offset)) { + resp.response_length += sizeof(resp.hca_core_clock_offset); + if (!mlx4_get_internal_clock_params(dev->dev, &clock_params)) { + resp.comp_mask |= MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE; + } + } + + if (uhw->outlen >= resp.response_length + + sizeof(resp.max_inl_recv_sz)) { + resp.response_length += sizeof(resp.max_inl_recv_sz); + resp.max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + } + + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + if (props->rss_caps.supported_qpts) { + resp.rss_caps.rx_hash_function = + MLX4_IB_RX_HASH_FUNC_TOEPLITZ; + + resp.rss_caps.rx_hash_fields_mask = + MLX4_IB_RX_HASH_SRC_IPV4 | + MLX4_IB_RX_HASH_DST_IPV4 | + MLX4_IB_RX_HASH_SRC_IPV6 | + MLX4_IB_RX_HASH_DST_IPV6 | + MLX4_IB_RX_HASH_SRC_PORT_TCP | + MLX4_IB_RX_HASH_DST_PORT_TCP | + MLX4_IB_RX_HASH_SRC_PORT_UDP | + MLX4_IB_RX_HASH_DST_PORT_UDP; + + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + resp.rss_caps.rx_hash_fields_mask |= + MLX4_IB_RX_HASH_INNER; + } + resp.response_length = offsetof(typeof(resp), rss_caps) + + sizeof(resp.rss_caps); + } + + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + if (dev->dev->caps.max_gso_sz && + ((mlx4_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(ibdev, 2) == + IB_LINK_LAYER_ETHERNET))) { + resp.tso_caps.max_tso = dev->dev->caps.max_gso_sz; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + } + resp.response_length = offsetof(typeof(resp), tso_caps) + + sizeof(resp.tso_caps); + } + + if (uhw->outlen) { + err = ib_copy_to_udata(uhw, &resp, resp.response_length); + if (err) + goto out; + } +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static enum rdma_link_layer +mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx4_dev *dev = to_mdev(device)->dev; + + return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; +} + +static int ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + if (netw_view) + props->gid_tbl_len = out_mad->data[50]; + else + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; + props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = IB_SPEED_FDR; + break; + case 2: + props->active_speed = IB_SPEED_EDR; + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == IB_SPEED_QDR) { + init_query_mad(in_mad); + in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = IB_SPEED_FDR10; + } + + /* Avoid wrong speed value returned by FW if the IB link is down. */ + if (props->state == IB_PORT_DOWN) + props->active_speed = IB_SPEED_SDR; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static u8 state_to_phys_state(enum ib_port_state state) +{ + return state == IB_PORT_ACTIVE ? 5 : 3; +} + +static int eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + struct mlx4_ib_iboe *iboe = &mdev->iboe; + struct net_device *ndev; + enum ib_mtu tmp; + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + int is_bonded = mlx4_is_bonded(mdev->dev); + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) || + (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ? + IB_WIDTH_4X : IB_WIDTH_1X; + props->active_speed = (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ? + IB_SPEED_FDR : IB_SPEED_QDR; + props->port_cap_flags = IB_PORT_CM_SUP; + props->ip_gids = true; + props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; + props->max_msg_sz = mdev->dev->caps.max_msg_sz; + props->pkey_tbl_len = 1; + props->max_mtu = IB_MTU_4096; + props->max_vl_num = 2; + props->state = IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); + props->active_mtu = IB_MTU_256; + spin_lock_bh(&iboe->lock); + ndev = iboe->netdevs[port - 1]; + if (ndev && is_bonded) { + rcu_read_lock(); /* required to get upper dev */ + ndev = netdev_master_upper_dev_get_rcu(ndev); + rcu_read_unlock(); + } + if (!ndev) + goto out_unlock; + + tmp = iboe_get_mtu(ndev->mtu); + props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; + + props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); +out_unlock: + spin_unlock_bh(&iboe->lock); +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + int err; + + /* props being zeroed by the caller, avoid zeroing it here */ + + err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? + ib_link_query_port(ibdev, port, props, netw_view) : + eth_link_query_port(ibdev, port, props); + + return err; +} + +static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + /* returns host view */ + return __mlx4_ib_query_port(ibdev, port, props, 0); +} + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int clear = 0; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(dev->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + if (mlx4_is_mfunc(dev->dev) && !netw_view) { + if (index) { + /* For any index > 0, return the null guid */ + err = 0; + clear = 1; + goto out; + } + } + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + if (clear) + memset(gid->raw + 8, 0, 8); + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + if (rdma_protocol_ib(ibdev, port)) + return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); + return 0; +} + +static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u8 port, u64 *sl2vl_tbl) +{ + union sl2vl_tbl_to_u64 sl2vl64; + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + int jj; + + if (mlx4_is_slave(to_mdev(ibdev)->dev)) { + *sl2vl_tbl = 0; + return 0; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_SL_TO_VL_TABLE; + in_mad->attr_mod = 0; + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + for (jj = 0; jj < 8; jj++) + sl2vl64.sl8[jj] = ((struct ib_smp *)out_mad)->data[jj]; + *sl2vl_tbl = sl2vl64.sl64; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static void mlx4_init_sl2vl_tbl(struct mlx4_ib_dev *mdev) +{ + u64 sl2vl; + int i; + int err; + + for (i = 1; i <= mdev->dev->caps.num_ports; i++) { + if (mdev->dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) + continue; + err = mlx4_ib_query_sl2vl(&mdev->ib_dev, i, &sl2vl); + if (err) { + pr_err("Unable to get default sl to vl mapping for port %d. Using all zeroes (%d)\n", + i, err); + sl2vl = 0; + } + atomic64_set(&mdev->sl2vl[i - 1], sl2vl); + } +} + +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); +} + +static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx4_cmd_mailbox *mailbox; + unsigned long flags; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + if (mlx4_is_slave(to_mdev(ibdev)->dev)) + return -EOPNOTSUPP; + + spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); + memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); + if (IS_ERR(mailbox)) + return 0; + + memcpy(mailbox->buf, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); + + return 0; +} + +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; + ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); + } else { + ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; + ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); + } + + err = mlx4_cmd(dev->dev, mailbox->dma, port, MLX4_SET_PORT_IB_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + return err; +} + +static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + struct ib_port_attr attr; + u32 cap_mask; + int err; + + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. + */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); + + err = ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); + +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; + struct mlx4_ib_alloc_ucontext_resp resp; + int err; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + resp.cqe_size = dev->dev->caps.cqe_size; + } + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + INIT_LIST_HEAD(&context->wqn_ranges_list); + mutex_init(&context->wqn_ranges_mutex); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + + if (err) { + mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + return &context->ibucontext; +} + +static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); + + mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); + kfree(context); + + return 0; +} + +static void mlx4_ib_vma_open(struct vm_area_struct *area) +{ + /* vma_open is called when a new VMA is created on top of our VMA. + * This is done through either mremap flow or split_vma (usually due + * to mlock, madvise, munmap, etc.). We do not support a clone of the + * vma, as this VMA is strongly hardware related. Therefore we set the + * vm_ops of the newly created/cloned VMA to NULL, to prevent it from + * calling us again and trying to do incorrect actions. We assume that + * the original vma size is exactly a single page that there will be no + * "splitting" operations on. + */ + area->vm_ops = NULL; +} + +static void mlx4_ib_vma_close(struct vm_area_struct *area) +{ + struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data; + + /* It's guaranteed that all VMAs opened on a FD are closed before the + * file itself is closed, therefore no sync is needed with the regular + * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync + * with accessing the vma as part of mlx4_ib_disassociate_ucontext. + * The close operation is usually called under mm->mmap_sem except when + * process is exiting. The exiting case is handled explicitly as part + * of mlx4_ib_disassociate_ucontext. + */ + mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *) + area->vm_private_data; + + /* set the vma context pointer to null in the mlx4_ib driver's private + * data to protect against a race condition in mlx4_ib_dissassociate_ucontext(). + */ + mlx4_ib_vma_priv_data->vma = NULL; +} + +static const struct vm_operations_struct mlx4_ib_vm_ops = { + .open = mlx4_ib_vma_open, + .close = mlx4_ib_vma_close +}; + +static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + int i; + struct vm_area_struct *vma; + struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); + + /* need to protect from a race on closing the vma as part of + * mlx4_ib_vma_close(). + */ + for (i = 0; i < HW_BAR_COUNT; i++) { + vma = context->hw_bar_info[i].vma; + if (!vma) + continue; + + zap_vma_ptes(context->hw_bar_info[i].vma, + context->hw_bar_info[i].vma->vm_start, PAGE_SIZE); + + context->hw_bar_info[i].vma->vm_flags &= + ~(VM_SHARED | VM_MAYSHARE); + /* context going to be destroyed, should not access ops any more */ + context->hw_bar_info[i].vma->vm_ops = NULL; + } +} + +static void mlx4_ib_set_vma_data(struct vm_area_struct *vma, + struct mlx4_ib_vma_private_data *vma_private_data) +{ + vma_private_data->vma = vma; + vma->vm_private_data = vma_private_data; + vma->vm_ops = &mlx4_ib_vm_ops; +} + +static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx4_ib_dev *dev = to_mdev(context->device); + struct mlx4_ib_ucontext *mucontext = to_mucontext(context); + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_pgoff == 0) { + /* We prevent double mmaping on same context */ + if (mucontext->hw_bar_info[HW_BAR_DB].vma) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]); + + } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { + /* We prevent double mmaping on same context */ + if (mucontext->hw_bar_info[HW_BAR_BF].vma) + return -EINVAL; + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn + + dev->dev->caps.num_uars, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]); + + } else if (vma->vm_pgoff == 3) { + struct mlx4_clock_params params; + int ret; + + /* We prevent double mmaping on same context */ + if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma) + return -EINVAL; + + ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); + + if (ret) + return ret; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, + (pci_resource_start(dev->dev->persist->pdev, + params.bar) + + params.offset) + >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx4_ib_set_vma_data(vma, + &mucontext->hw_bar_info[HW_BAR_CLOCK]); + } else { + return -EINVAL; + } + + return 0; +} + +static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_pd *pd; + int err; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) + if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { + mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + return &pd->ibpd; +} + +static int mlx4_ib_dealloc_pd(struct ib_pd *pd) +{ + mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); + kfree(pd); + + return 0; +} + +static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_xrcd *xrcd; + struct ib_cq_init_attr cq_attr = {}; + int err; + + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); + if (err) + goto err1; + + xrcd->pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(xrcd->pd)) { + err = PTR_ERR(xrcd->pd); + goto err2; + } + + cq_attr.cqe = 1; + xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, &cq_attr); + if (IS_ERR(xrcd->cq)) { + err = PTR_ERR(xrcd->cq); + goto err3; + } + + return &xrcd->ibxrcd; + +err3: + ib_dealloc_pd(xrcd->pd); +err2: + mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); +err1: + kfree(xrcd); + return ERR_PTR(err); +} + +static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + ib_destroy_cq(to_mxrcd(xrcd)->cq); + ib_dealloc_pd(to_mxrcd(xrcd)->pd); + mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); + kfree(xrcd); + + return 0; +} + +static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_gid_entry *ge; + + ge = kzalloc(sizeof *ge, GFP_KERNEL); + if (!ge) + return -ENOMEM; + + ge->gid = *gid; + if (mlx4_ib_add_mc(mdev, mqp, gid)) { + ge->port = mqp->port; + ge->added = 1; + } + + mutex_lock(&mqp->mutex); + list_add_tail(&ge->list, &mqp->gid_list); + mutex_unlock(&mqp->mutex); + + return 0; +} + +static void mlx4_ib_delete_counters_table(struct mlx4_ib_dev *ibdev, + struct mlx4_ib_counters *ctr_table) +{ + struct counter_index *counter, *tmp_count; + + mutex_lock(&ctr_table->mutex); + list_for_each_entry_safe(counter, tmp_count, &ctr_table->counters_list, + list) { + if (counter->allocated) + mlx4_counter_free(ibdev->dev, counter->index); + list_del(&counter->list); + kfree(counter); + } + mutex_unlock(&ctr_table->mutex); +} + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid) +{ + struct net_device *ndev; + int ret = 0; + + if (!mqp->port) + return 0; + + spin_lock_bh(&mdev->iboe.lock); + ndev = mdev->iboe.netdevs[mqp->port - 1]; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + + if (ndev) { + ret = 1; + dev_put(ndev); + } + + return ret; +} + +struct mlx4_ib_steering { + struct list_head list; + struct mlx4_flow_reg_id reg_id; + union ib_gid gid; +}; + +#define LAST_ETH_FIELD vlan_tag +#define LAST_IB_FIELD sl +#define LAST_IPV4_FIELD dst_ip +#define LAST_TCP_UDP_FIELD src_port + +/* Field is the last supported field */ +#define FIELDS_NOT_SUPPORTED(filter, field)\ + memchr_inv((void *)&filter.field +\ + sizeof(filter.field), 0,\ + sizeof(filter) -\ + offsetof(typeof(filter), field) -\ + sizeof(filter.field)) + +static int parse_flow_attr(struct mlx4_dev *dev, + u32 qp_num, + union ib_flow_spec *ib_spec, + struct _rule_hw *mlx4_spec) +{ + enum mlx4_net_trans_rule_id type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) + return -ENOTSUPP; + + type = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, + ETH_ALEN); + memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, + ETH_ALEN); + mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; + mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; + break; + case IB_FLOW_SPEC_IB: + if (FIELDS_NOT_SUPPORTED(ib_spec->ib.mask, LAST_IB_FIELD)) + return -ENOTSUPP; + + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = + cpu_to_be32(qp_num); + mlx4_spec->ib.qpn_mask = + cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); + break; + + + case IB_FLOW_SPEC_IPV4: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) + return -ENOTSUPP; + + type = MLX4_NET_TRANS_RULE_ID_IPV4; + mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; + mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; + mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; + mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; + break; + + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) + return -ENOTSUPP; + + type = ib_spec->type == IB_FLOW_SPEC_TCP ? + MLX4_NET_TRANS_RULE_ID_TCP : + MLX4_NET_TRANS_RULE_ID_UDP; + mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; + mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; + mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; + mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; + break; + + default: + return -EINVAL; + } + if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || + mlx4_hw_rule_sz(dev, type) < 0) + return -EINVAL; + mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; + return mlx4_hw_rule_sz(dev, type); +} + +struct default_rules { + __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u8 link_layer; +}; +static const struct default_rules default_table[] = { + { + .mandatory_fields = {IB_FLOW_SPEC_IPV4}, + .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, + .rules_create_list = {IB_FLOW_SPEC_IB}, + .link_layer = IB_LINK_LAYER_INFINIBAND + } +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, + struct ib_flow_attr *flow_attr) +{ + int i, j, k; + void *ib_flow; + const struct default_rules *pdefault_rules = default_table; + u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + + for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) { + __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; + memset(&field_types, 0, sizeof(field_types)); + + if (link_layer != pdefault_rules->link_layer) + continue; + + ib_flow = flow_attr + 1; + /* we assume the specs are sorted */ + for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && + j < flow_attr->num_of_specs; k++) { + union ib_flow_spec *current_flow = + (union ib_flow_spec *)ib_flow; + + /* same layer but different type */ + if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == + (pdefault_rules->mandatory_fields[k] & + IB_FLOW_SPEC_LAYER_MASK)) && + (current_flow->type != + pdefault_rules->mandatory_fields[k])) + goto out; + + /* same layer, try match next one */ + if (current_flow->type == + pdefault_rules->mandatory_fields[k]) { + j++; + ib_flow += + ((union ib_flow_spec *)ib_flow)->size; + } + } + + ib_flow = flow_attr + 1; + for (j = 0; j < flow_attr->num_of_specs; + j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) + for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) + /* same layer and same type */ + if (((union ib_flow_spec *)ib_flow)->type == + pdefault_rules->mandatory_not_fields[k]) + goto out; + + return i; + } +out: + return -1; +} + +static int __mlx4_ib_create_default_rules( + struct mlx4_ib_dev *mdev, + struct ib_qp *qp, + const struct default_rules *pdefault_rules, + struct _rule_hw *mlx4_spec) { + int size = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) { + union ib_flow_spec ib_spec = {}; + int ret; + + switch (pdefault_rules->rules_create_list[i]) { + case 0: + /* no rule */ + continue; + case IB_FLOW_SPEC_IB: + ib_spec.type = IB_FLOW_SPEC_IB; + ib_spec.size = sizeof(struct ib_flow_spec_ib); + + break; + default: + /* invalid rule */ + return -EINVAL; + } + /* We must put empty rule, qpn is being ignored */ + ret = parse_flow_attr(mdev->dev, 0, &ib_spec, + mlx4_spec); + if (ret < 0) { + pr_info("invalid parsing\n"); + return -EINVAL; + } + + mlx4_spec = (void *)mlx4_spec + ret; + size += ret; + } + return size; +} + +static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + int domain, + enum mlx4_net_trans_promisc_mode flow_type, + u64 *reg_id) +{ + int ret, i; + int size = 0; + void *ib_flow; + struct mlx4_ib_dev *mdev = to_mdev(qp->device); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + int default_flow; + + static const u16 __mlx4_domain[] = { + [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, + [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, + [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, + [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, + }; + + if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { + pr_err("Invalid priority value %d\n", flow_attr->priority); + return -EINVAL; + } + + if (domain >= IB_FLOW_DOMAIN_NUM) { + pr_err("Invalid domain value %d\n", domain); + return -EINVAL; + } + + if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + ctrl = mailbox->buf; + + ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | + flow_attr->priority); + ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->port = flow_attr->port; + ctrl->qpn = cpu_to_be32(qp->qp_num); + + ib_flow = flow_attr + 1; + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + /* Add default flows */ + default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); + if (default_flow >= 0) { + ret = __mlx4_ib_create_default_rules( + mdev, qp, default_table + default_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + size += ret; + } + for (i = 0; i < flow_attr->num_of_specs; i++) { + ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + ib_flow += ((union ib_flow_spec *) ib_flow)->size; + size += ret; + } + + if (mlx4_is_master(mdev->dev) && flow_type == MLX4_FS_REGULAR && + flow_attr->num_of_specs == 1) { + struct _rule_hw *rule_header = (struct _rule_hw *)(ctrl + 1); + enum ib_flow_spec_type header_spec = + ((union ib_flow_spec *)(flow_attr + 1))->type; + + if (header_spec == IB_FLOW_SPEC_ETH) + mlx4_handle_eth_header_mcast_prio(ctrl, rule_header); + } + + ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret == -ENOMEM) + pr_err("mcg table is full. Fail to register network rule.\n"); + else if (ret == -ENXIO) + pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); + else if (ret) + pr_err("Invalid argument. Fail to register network rule.\n"); + + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return ret; +} + +static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + err = mlx4_cmd(dev, reg_id, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + pr_err("Fail to detach network rule. registration id = 0x%llx\n", + reg_id); + return err; +} + +static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + u64 *reg_id) +{ + void *ib_flow; + union ib_flow_spec *ib_spec; + struct mlx4_dev *dev = to_mdev(qp->device)->dev; + int err = 0; + + if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN || + dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) + return 0; /* do nothing */ + + ib_flow = flow_attr + 1; + ib_spec = (union ib_flow_spec *)ib_flow; + + if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1) + return 0; /* do nothing */ + + err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac, + flow_attr->port, qp->qp_num, + MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff), + reg_id); + return err; +} + +static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, + struct ib_flow_attr *flow_attr, + enum mlx4_net_trans_promisc_mode *type) +{ + int err = 0; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER) || + (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) || + (flow_attr->num_of_specs > 1) || (flow_attr->priority != 0)) { + return -EOPNOTSUPP; + } + + if (flow_attr->num_of_specs == 0) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + union ib_flow_spec *ib_spec; + + ib_spec = (union ib_flow_spec *)(flow_attr + 1); + if (ib_spec->type != IB_FLOW_SPEC_ETH) + return -EINVAL; + + /* if all is zero than MC and UC */ + if (is_zero_ether_addr(ib_spec->eth.mask.dst_mac)) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + u8 mac[ETH_ALEN] = {ib_spec->eth.mask.dst_mac[0] ^ 0x01, + ib_spec->eth.mask.dst_mac[1], + ib_spec->eth.mask.dst_mac[2], + ib_spec->eth.mask.dst_mac[3], + ib_spec->eth.mask.dst_mac[4], + ib_spec->eth.mask.dst_mac[5]}; + + /* Above xor was only on MC bit, non empty mask is valid + * only if this bit is set and rest are zero. + */ + if (!is_zero_ether_addr(&mac[0])) + return -EINVAL; + + if (is_multicast_ether_addr(ib_spec->eth.val.dst_mac)) + type[0] = MLX4_FS_MC_SNIFFER; + else + type[0] = MLX4_FS_UC_SNIFFER; + } + } + + return err; +} + +static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain, struct ib_udata *udata) +{ + int err = 0, i = 0, j = 0; + struct mlx4_ib_flow *mflow; + enum mlx4_net_trans_promisc_mode type[2]; + struct mlx4_dev *dev = (to_mdev(qp->device))->dev; + int is_bonded = mlx4_is_bonded(dev); + + if (flow_attr->port < 1 || flow_attr->port > qp->device->phys_port_cnt) + return ERR_PTR(-EINVAL); + + if (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP) + return ERR_PTR(-EOPNOTSUPP); + + if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + (flow_attr->type != IB_FLOW_ATTR_NORMAL)) + return ERR_PTR(-EOPNOTSUPP); + + if (udata && + udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + memset(type, 0, sizeof(type)); + + mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); + if (!mflow) { + err = -ENOMEM; + goto err_free; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + /* If dont trap flag (continue match) is set, under specific + * condition traffic be replicated to given qp, + * without stealing it + */ + if (unlikely(flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)) { + err = mlx4_ib_add_dont_trap_rule(dev, + flow_attr, + type); + if (err) + goto err_free; + } else { + type[0] = MLX4_FS_REGULAR; + } + break; + + case IB_FLOW_ATTR_ALL_DEFAULT: + type[0] = MLX4_FS_ALL_DEFAULT; + break; + + case IB_FLOW_ATTR_MC_DEFAULT: + type[0] = MLX4_FS_MC_DEFAULT; + break; + + case IB_FLOW_ATTR_SNIFFER: + type[0] = MLX4_FS_MIRROR_RX_PORT; + type[1] = MLX4_FS_MIRROR_SX_PORT; + break; + + default: + err = -EINVAL; + goto err_free; + } + + while (i < ARRAY_SIZE(type) && type[i]) { + err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], + &mflow->reg_id[i].id); + if (err) + goto err_create_flow; + if (is_bonded) { + /* Application always sees one port so the mirror rule + * must be on port #2 + */ + flow_attr->port = 2; + err = __mlx4_ib_create_flow(qp, flow_attr, + domain, type[j], + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + + i++; + } + + if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) { + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[i].id); + if (err) + goto err_create_flow; + + if (is_bonded) { + flow_attr->port = 2; + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + /* function to create mirror rule */ + i++; + } + + return &mflow->ibflow; + +err_create_flow: + while (i) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[i].id); + i--; + } + + while (j) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[j].mirror); + j--; + } +err_free: + kfree(mflow); + return ERR_PTR(err); +} + +static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) +{ + int err, ret = 0; + int i = 0; + struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); + struct mlx4_ib_flow *mflow = to_mflow(flow_id); + + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id); + if (err) + ret = err; + if (mflow->reg_id[i].mirror) { + err = __mlx4_ib_destroy_flow(mdev->dev, + mflow->reg_id[i].mirror); + if (err) + ret = err; + } + i++; + } + + kfree(mflow); + return ret; +} + +static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; + struct mlx4_flow_reg_id reg_id; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); + if (!ib_steering) + return -ENOMEM; + } + + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id.id); + if (err) { + pr_err("multicast attach op failed, err %d\n", err); + goto err_malloc; + } + + reg_id.mirror = 0; + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, + (mqp->port == 1) ? 2 : 1, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id.mirror); + if (err) + goto err_add; + } + + err = add_gid_entry(ibqp, gid); + if (err) + goto err_add; + + if (ib_steering) { + memcpy(ib_steering->gid.raw, gid->raw, 16); + ib_steering->reg_id = reg_id; + mutex_lock(&mqp->mutex); + list_add(&ib_steering->list, &mqp->steering_rules); + mutex_unlock(&mqp->mutex); + } + return 0; + +err_add: + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.id); + if (reg_id.mirror) + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); +err_malloc: + kfree(ib_steering); + + return err; +} + +static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +{ + struct mlx4_ib_gid_entry *ge; + struct mlx4_ib_gid_entry *tmp; + struct mlx4_ib_gid_entry *ret = NULL; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!memcmp(raw, ge->gid.raw, 16)) { + ret = ge; + break; + } + } + + return ret; +} + +static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct net_device *ndev; + struct mlx4_ib_gid_entry *ge; + struct mlx4_flow_reg_id reg_id = {0, 0}; + enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + struct mlx4_ib_steering *ib_steering; + + mutex_lock(&mqp->mutex); + list_for_each_entry(ib_steering, &mqp->steering_rules, list) { + if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { + list_del(&ib_steering->list); + break; + } + } + mutex_unlock(&mqp->mutex); + if (&ib_steering->list == &mqp->steering_rules) { + pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n"); + return -EINVAL; + } + reg_id = ib_steering->reg_id; + kfree(ib_steering); + } + + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.id); + if (err) + return err; + + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); + if (err) + return err; + } + + mutex_lock(&mqp->mutex); + ge = find_gid_entry(mqp, gid->raw); + if (ge) { + spin_lock_bh(&mdev->iboe.lock); + ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + if (ndev) + dev_put(ndev); + list_del(&ge->list); + kfree(ge); + } else + pr_warn("could not find mgid entry\n"); + + mutex_unlock(&mqp->mutex); + + return 0; +} + +static int init_node_data(struct mlx4_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + if (mlx4_is_master(dev->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->dev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, + dev->dev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *mlx4_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +struct diag_counter { + const char *name; + u32 offset; +}; + +#define DIAG_COUNTER(_name, _offset) \ + { .name = #_name, .offset = _offset } + +static const struct diag_counter diag_basic[] = { + DIAG_COUNTER(rq_num_lle, 0x00), + DIAG_COUNTER(sq_num_lle, 0x04), + DIAG_COUNTER(rq_num_lqpoe, 0x08), + DIAG_COUNTER(sq_num_lqpoe, 0x0C), + DIAG_COUNTER(rq_num_lpe, 0x18), + DIAG_COUNTER(sq_num_lpe, 0x1C), + DIAG_COUNTER(rq_num_wrfe, 0x20), + DIAG_COUNTER(sq_num_wrfe, 0x24), + DIAG_COUNTER(sq_num_mwbe, 0x2C), + DIAG_COUNTER(sq_num_bre, 0x34), + DIAG_COUNTER(sq_num_rire, 0x44), + DIAG_COUNTER(rq_num_rire, 0x48), + DIAG_COUNTER(sq_num_rae, 0x4C), + DIAG_COUNTER(rq_num_rae, 0x50), + DIAG_COUNTER(sq_num_roe, 0x54), + DIAG_COUNTER(sq_num_tree, 0x5C), + DIAG_COUNTER(sq_num_rree, 0x64), + DIAG_COUNTER(rq_num_rnr, 0x68), + DIAG_COUNTER(sq_num_rnr, 0x6C), + DIAG_COUNTER(rq_num_oos, 0x100), + DIAG_COUNTER(sq_num_oos, 0x104), +}; + +static const struct diag_counter diag_ext[] = { + DIAG_COUNTER(rq_num_dup, 0x130), + DIAG_COUNTER(sq_num_to, 0x134), +}; + +static const struct diag_counter diag_device_only[] = { + DIAG_COUNTER(num_cqovf, 0x1A0), + DIAG_COUNTER(rq_num_udsdprd, 0x118), +}; + +static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_diag_counters *diag = dev->diag_counters; + + if (!diag[!!port_num].name) + return NULL; + + return rdma_alloc_hw_stats_struct(diag[!!port_num].name, + diag[!!port_num].num_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx4_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_diag_counters *diag = dev->diag_counters; + u32 hw_value[ARRAY_SIZE(diag_device_only) + + ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {}; + int ret; + int i; + + ret = mlx4_query_diag_counters(dev->dev, + MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS, + diag[!!port].offset, hw_value, + diag[!!port].num_counters, port); + + if (ret) + return ret; + + for (i = 0; i < diag[!!port].num_counters; i++) + stats->value[i] = hw_value[i]; + + return diag[!!port].num_counters; +} + +static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev, + const char ***name, + u32 **offset, + u32 *num, + bool port) +{ + u32 num_counters; + + num_counters = ARRAY_SIZE(diag_basic); + + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) + num_counters += ARRAY_SIZE(diag_ext); + + if (!port) + num_counters += ARRAY_SIZE(diag_device_only); + + *name = kcalloc(num_counters, sizeof(**name), GFP_KERNEL); + if (!*name) + return -ENOMEM; + + *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL); + if (!*offset) + goto err_name; + + *num = num_counters; + + return 0; + +err_name: + kfree(*name); + return -ENOMEM; +} + +static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev, + const char **name, + u32 *offset, + bool port) +{ + int i; + int j; + + for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) { + name[i] = diag_basic[i].name; + offset[i] = diag_basic[i].offset; + } + + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) { + for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) { + name[j] = diag_ext[i].name; + offset[j] = diag_ext[i].offset; + } + } + + if (!port) { + for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) { + name[j] = diag_device_only[i].name; + offset[j] = diag_device_only[i].offset; + } + } +} + +static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_diag_counters *diag = ibdev->diag_counters; + int i; + int ret; + bool per_port = !!(ibdev->dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT); + + if (mlx4_is_slave(ibdev->dev)) + return 0; + + for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { + /* i == 1 means we are building port counters */ + if (i && !per_port) + continue; + + ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].name, + &diag[i].offset, + &diag[i].num_counters, i); + if (ret) + goto err_alloc; + + mlx4_ib_fill_diag_counters(ibdev, diag[i].name, + diag[i].offset, i); + } + + ibdev->ib_dev.get_hw_stats = mlx4_ib_get_hw_stats; + ibdev->ib_dev.alloc_hw_stats = mlx4_ib_alloc_hw_stats; + + return 0; + +err_alloc: + if (i) { + kfree(diag[i - 1].name); + kfree(diag[i - 1].offset); + } + + return ret; +} + +static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev) +{ + int i; + + for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { + kfree(ibdev->diag_counters[i].offset); + kfree(ibdev->diag_counters[i].name); + } +} + +#define MLX4_IB_INVALID_MAC ((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + int port) +{ + u64 new_smac = 0; + u64 release_mac = MLX4_IB_INVALID_MAC; + struct mlx4_ib_qp *qp; + + read_lock(&dev_base_lock); + new_smac = mlx4_mac_to_u64(dev->dev_addr); + read_unlock(&dev_base_lock); + + atomic64_set(&ibdev->iboe.mac[port - 1], new_smac); + + /* no need for update QP1 and mac registration in non-SRIOV */ + if (!mlx4_is_mfunc(ibdev->dev)) + return; + + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); + qp = ibdev->qp1_proxy[port - 1]; + if (qp) { + int new_smac_index; + u64 old_smac; + struct mlx4_update_qp_params update_params; + + mutex_lock(&qp->mutex); + old_smac = qp->pri.smac; + if (new_smac == old_smac) + goto unlock; + + new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + + if (new_smac_index < 0) + goto unlock; + + update_params.smac_index = new_smac_index; + if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC, + &update_params)) { + release_mac = new_smac; + goto unlock; + } + /* if old port was zero, no mac was yet registered for this QP */ + if (qp->pri.smac_port) + release_mac = old_smac; + qp->pri.smac = new_smac; + qp->pri.smac_port = port; + qp->pri.smac_index = new_smac_index; + } + +unlock: + if (release_mac != MLX4_IB_INVALID_MAC) + mlx4_unregister_mac(ibdev->dev, port, release_mac); + if (qp) + mutex_unlock(&qp->mutex); + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + unsigned long event) + +{ + struct mlx4_ib_iboe *iboe; + int update_qps_port = -1; + int port; + + ASSERT_RTNL(); + + iboe = &ibdev->iboe; + + spin_lock_bh(&iboe->lock); + mlx4_foreach_ib_transport_port(port, ibdev->dev) { + + iboe->netdevs[port - 1] = + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); + + if (dev == iboe->netdevs[port - 1] && + (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || + event == NETDEV_UP || event == NETDEV_CHANGE)) + update_qps_port = port; + + } + spin_unlock_bh(&iboe->lock); + + if (update_qps_port > 0) + mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlx4_ib_dev *ibdev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + mlx4_ib_scan_netdevs(ibdev, dev, event); + + return NOTIFY_DONE; +} + +static void init_pkeys(struct mlx4_ib_dev *ibdev) +{ + int port; + int slave; + int i; + + if (mlx4_is_master(ibdev->dev)) { + for (slave = 0; slave <= ibdev->dev->persist->num_vfs; + ++slave) { + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) { + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = + /* master has the identity virt2phys pkey mapping */ + (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : + ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; + mlx4_sync_pkey_table(ibdev->dev, slave, port, i, + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); + } + } + } + /* initialize pkey cache */ + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) + ibdev->pkeys.phys_pkey_cache[port-1][i] = + (i) ? 0 : 0xFFFF; + } + } +} + +static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + int i, j, eq = 0, total_eqs = 0; + + ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors, + sizeof(ibdev->eq_table[0]), GFP_KERNEL); + if (!ibdev->eq_table) + return; + + for (i = 1; i <= dev->caps.num_ports; i++) { + for (j = 0; j < mlx4_get_eqs_per_port(dev, i); + j++, total_eqs++) { + if (i > 1 && mlx4_is_eq_shared(dev, total_eqs)) + continue; + ibdev->eq_table[eq] = total_eqs; + if (!mlx4_assign_eq(dev, i, + &ibdev->eq_table[eq])) + eq++; + else + ibdev->eq_table[eq] = -1; + } + } + + for (i = eq; i < dev->caps.num_comp_vectors; + ibdev->eq_table[i++] = -1) + ; + + /* Advertise the new number of EQs to clients */ + ibdev->ib_dev.num_comp_vectors = eq; +} + +static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + int i; + int total_eqs = ibdev->ib_dev.num_comp_vectors; + + /* no eqs were allocated */ + if (!ibdev->eq_table) + return; + + /* Reset the advertised EQ number */ + ibdev->ib_dev.num_comp_vectors = 0; + + for (i = 0; i < total_eqs; i++) + mlx4_release_eq(dev, ibdev->eq_table[i]); + + kfree(ibdev->eq_table); + ibdev->eq_table = NULL; +} + +static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + int err; + + if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) { + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } else { + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->core_cap_flags |= RDMA_CORE_PORT_RAW_PACKET; + if (immutable->core_cap_flags & (RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP)) + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static void get_fw_ver_str(struct ib_device *device, char *str) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", + (int) (dev->dev->caps.fw_ver >> 32), + (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->dev->caps.fw_ver & 0xffff); +} + +static void *mlx4_ib_add(struct mlx4_dev *dev) +{ + struct mlx4_ib_dev *ibdev; + int num_ports = 0; + int i, j; + int err; + struct mlx4_ib_iboe *iboe; + int ib_num_ports = 0; + int num_req_counters; + int allocated; + u32 counter_index; + struct counter_index *new_counter_index = NULL; + + pr_info_once("%s", mlx4_ib_version); + + num_ports = 0; + mlx4_foreach_ib_transport_port(i, dev) + num_ports++; + + /* No point in registering a device with no ports... */ + if (num_ports == 0) + return NULL; + + ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); + if (!ibdev) { + dev_err(&dev->persist->pdev->dev, + "Device struct alloc failed\n"); + return NULL; + } + + iboe = &ibdev->iboe; + + if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) + goto err_dealloc; + + if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) + goto err_pd; + + ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT, + PAGE_SIZE); + if (!ibdev->uar_map) + goto err_uar; + MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); + + ibdev->dev = dev; + ibdev->bond_next_port = 0; + + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + ibdev->ib_dev.owner = THIS_MODULE; + ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; + ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; + ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ? + 1 : ibdev->num_ports; + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + ibdev->ib_dev.dev.parent = &dev->persist->pdev->dev; + ibdev->ib_dev.get_netdev = mlx4_ib_get_netdev; + ibdev->ib_dev.add_gid = mlx4_ib_add_gid; + ibdev->ib_dev.del_gid = mlx4_ib_del_gid; + + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + + ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + ibdev->ib_dev.query_device = mlx4_ib_query_device; + ibdev->ib_dev.query_port = mlx4_ib_query_port; + ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; + ibdev->ib_dev.query_gid = mlx4_ib_query_gid; + ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; + ibdev->ib_dev.modify_device = mlx4_ib_modify_device; + ibdev->ib_dev.modify_port = mlx4_ib_modify_port; + ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; + ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; + ibdev->ib_dev.mmap = mlx4_ib_mmap; + ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; + ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; + ibdev->ib_dev.create_ah = mlx4_ib_create_ah; + ibdev->ib_dev.query_ah = mlx4_ib_query_ah; + ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; + ibdev->ib_dev.create_srq = mlx4_ib_create_srq; + ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; + ibdev->ib_dev.query_srq = mlx4_ib_query_srq; + ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; + ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; + ibdev->ib_dev.create_qp = mlx4_ib_create_qp; + ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; + ibdev->ib_dev.query_qp = mlx4_ib_query_qp; + ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; + ibdev->ib_dev.drain_sq = mlx4_ib_drain_sq; + ibdev->ib_dev.drain_rq = mlx4_ib_drain_rq; + ibdev->ib_dev.post_send = mlx4_ib_post_send; + ibdev->ib_dev.post_recv = mlx4_ib_post_recv; + ibdev->ib_dev.create_cq = mlx4_ib_create_cq; + ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; + ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; + ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; + ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; + ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; + ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; + ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; + ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; + ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; + ibdev->ib_dev.alloc_mr = mlx4_ib_alloc_mr; + ibdev->ib_dev.map_mr_sg = mlx4_ib_map_mr_sg; + ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; + ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; + ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + ibdev->ib_dev.get_port_immutable = mlx4_port_immutable; + ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; + ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ); + + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) == + IB_LINK_LAYER_ETHERNET))) { + ibdev->ib_dev.create_wq = mlx4_ib_create_wq; + ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq; + ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq; + ibdev->ib_dev.create_rwq_ind_table = + mlx4_ib_create_rwq_ind_table; + ibdev->ib_dev.destroy_rwq_ind_table = + mlx4_ib_destroy_rwq_ind_table; + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + } + + if (!mlx4_is_slave(ibdev->dev)) { + ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; + ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; + ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; + ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; + ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; + + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { + ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; + ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + if (check_flow_steering_support(dev)) { + ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; + ibdev->ib_dev.create_flow = mlx4_ib_create_flow; + ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + } + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); + + mlx4_ib_alloc_eqs(dev, ibdev); + + spin_lock_init(&iboe->lock); + + if (init_node_data(ibdev)) + goto err_map; + mlx4_init_sl2vl_tbl(ibdev); + + for (i = 0; i < ibdev->num_ports; ++i) { + mutex_init(&ibdev->counters_table[i].mutex); + INIT_LIST_HEAD(&ibdev->counters_table[i].counters_list); + } + + num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports; + for (i = 0; i < num_req_counters; ++i) { + mutex_init(&ibdev->qp1_proxy_lock[i]); + allocated = 0; + if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_counter_alloc(ibdev->dev, &counter_index, + MLX4_RES_USAGE_DRIVER); + /* if failed to allocate a new counter, use default */ + if (err) + counter_index = + mlx4_get_default_counter_index(dev, + i + 1); + else + allocated = 1; + } else { /* IB_LINK_LAYER_INFINIBAND use the default counter */ + counter_index = mlx4_get_default_counter_index(dev, + i + 1); + } + new_counter_index = kmalloc(sizeof(*new_counter_index), + GFP_KERNEL); + if (!new_counter_index) { + if (allocated) + mlx4_counter_free(ibdev->dev, counter_index); + goto err_counter; + } + new_counter_index->index = counter_index; + new_counter_index->allocated = allocated; + list_add_tail(&new_counter_index->list, + &ibdev->counters_table[i].counters_list); + ibdev->counters_table[i].default_counter = counter_index; + pr_info("counter index %d for port %d allocated %d\n", + counter_index, i + 1, allocated); + } + if (mlx4_is_bonded(dev)) + for (i = 1; i < ibdev->num_ports ; ++i) { + new_counter_index = + kmalloc(sizeof(struct counter_index), + GFP_KERNEL); + if (!new_counter_index) + goto err_counter; + new_counter_index->index = counter_index; + new_counter_index->allocated = 0; + list_add_tail(&new_counter_index->list, + &ibdev->counters_table[i].counters_list); + ibdev->counters_table[i].default_counter = + counter_index; + } + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + + spin_lock_init(&ibdev->sm_lock); + mutex_init(&ibdev->cap_mask_mutex); + INIT_LIST_HEAD(&ibdev->qp_list); + spin_lock_init(&ibdev->reset_flow_resource_lock); + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && + ib_num_ports) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, + &ibdev->steer_qpn_base, 0, + MLX4_RES_USAGE_DRIVER); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc_array(BITS_TO_LONGS(ibdev->steer_qpn_count), + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) + goto err_steer_qp_release; + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB) { + bitmap_zero(ibdev->ib_uc_qpns_bitmap, + ibdev->steer_qpn_count); + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( + dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } else { + bitmap_fill(ibdev->ib_uc_qpns_bitmap, + ibdev->steer_qpn_count); + } + } + + for (j = 1; j <= ibdev->dev->caps.num_ports; j++) + atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]); + + if (mlx4_ib_alloc_diag_counters(ibdev)) + goto err_steer_free_bitmap; + + ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; + if (ib_register_device(&ibdev->ib_dev, NULL)) + goto err_diag_counters; + + if (mlx4_ib_mad_init(ibdev)) + goto err_reg; + + if (mlx4_ib_init_sriov(ibdev)) + goto err_mad; + + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; + } + } + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT); + if (err) + goto err_notif; + } + + for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { + if (device_create_file(&ibdev->ib_dev.dev, + mlx4_class_attributes[j])) + goto err_notif; + } + + ibdev->ib_active = true; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i), + &ibdev->ib_dev); + + if (mlx4_is_mfunc(ibdev->dev)) + init_pkeys(ibdev); + + /* create paravirt contexts for any VFs which are active */ + if (mlx4_is_master(ibdev->dev)) { + for (j = 0; j < MLX4_MFUNC_MAX; j++) { + if (j == mlx4_master_func_num(ibdev->dev)) + continue; + if (mlx4_is_slave_active(ibdev->dev, j)) + do_slave_init(ibdev, j, 1); + } + } + return ibdev; + +err_notif: + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + flush_workqueue(wq); + + mlx4_ib_close_sriov(ibdev); + +err_mad: + mlx4_ib_mad_cleanup(ibdev); + +err_reg: + ib_unregister_device(&ibdev->ib_dev); + +err_diag_counters: + mlx4_ib_diag_cleanup(ibdev); + +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); +err_counter: + for (i = 0; i < ibdev->num_ports; ++i) + mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]); + +err_map: + mlx4_ib_free_eqs(dev, ibdev); + iounmap(ibdev->uar_map); + +err_uar: + mlx4_uar_free(dev, &ibdev->priv_uar); + +err_pd: + mlx4_pd_free(dev, ibdev->priv_pdn); + +err_dealloc: + ib_dealloc_device(&ibdev->ib_dev); + + return NULL; +} + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + if (WARN(qpn < dev->steer_qpn_base, "qpn = %u, steer_qpn_base = %u\n", + qpn, dev->steer_qpn_base)) + /* not supposed to be here */ + return; + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, + get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; + + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + /* Add an empty rule for IB L2 */ + memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; +} + +static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) +{ + struct mlx4_ib_dev *ibdev = ibdev_ptr; + int p; + int i; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + devlink_port_type_clear(mlx4_get_devlink_port(dev, i)); + ibdev->ib_active = false; + flush_workqueue(wq); + + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + + mlx4_ib_close_sriov(ibdev); + mlx4_ib_mad_cleanup(ibdev); + ib_unregister_device(&ibdev->ib_dev); + mlx4_ib_diag_cleanup(ibdev); + + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + + iounmap(ibdev->uar_map); + for (p = 0; p < ibdev->num_ports; ++p) + mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[p]); + + mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) + mlx4_CLOSE_PORT(dev, p); + + mlx4_ib_free_eqs(dev, ibdev); + + mlx4_uar_free(dev, &ibdev->priv_uar); + mlx4_pd_free(dev, ibdev->priv_pdn); + ib_dealloc_device(&ibdev->ib_dev); +} + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) +{ + struct mlx4_ib_demux_work **dm = NULL; + struct mlx4_dev *dev = ibdev->dev; + int i; + unsigned long flags; + struct mlx4_active_ports actv_ports; + unsigned int ports; + unsigned int first_port; + + if (!mlx4_is_master(dev)) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + + dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); + if (!dm) + return; + + for (i = 0; i < ports; i++) { + dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); + if (!dm[i]) { + while (--i >= 0) + kfree(dm[i]); + goto out; + } + INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); + dm[i]->port = first_port + i + 1; + dm[i]->slave = slave; + dm[i]->do_init = do_init; + dm[i]->dev = ibdev; + } + /* initialize or tear down tunnel QPs for the slave */ + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) { + for (i = 0; i < ports; i++) + queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } else { + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + for (i = 0; i < ports; i++) + kfree(dm[i]); + } +out: + kfree(dm); + return; +} + +static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_qp *mqp; + unsigned long flags_qp; + unsigned long flags_cq; + struct mlx4_ib_cq *send_mcq, *recv_mcq; + struct list_head cq_notify_list; + struct mlx4_cq *mcq; + unsigned long flags; + + pr_warn("mlx4_ib_handle_catas_error was started\n"); + INIT_LIST_HEAD(&cq_notify_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + /* Now, handle the QP's receive queue */ + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + + list_for_each_entry(mcq, &cq_notify_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + pr_warn("mlx4_ib_handle_catas_error ended\n"); +} + +static void handle_bonded_port_state_event(struct work_struct *work) +{ + struct ib_event_work *ew = + container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *ibdev = ew->ib_dev; + enum ib_port_state bonded_port_state = IB_PORT_NOP; + int i; + struct ib_event ibev; + + kfree(ew); + spin_lock_bh(&ibdev->iboe.lock); + for (i = 0; i < MLX4_MAX_PORTS; ++i) { + struct net_device *curr_netdev = ibdev->iboe.netdevs[i]; + enum ib_port_state curr_port_state; + + if (!curr_netdev) + continue; + + curr_port_state = + (netif_running(curr_netdev) && + netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + + bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ? + curr_port_state : IB_PORT_ACTIVE; + } + spin_unlock_bh(&ibdev->iboe.lock); + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = 1; + ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ? + IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + + ib_dispatch_event(&ibev); +} + +void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port) +{ + u64 sl2vl; + int err; + + err = mlx4_ib_query_sl2vl(&mdev->ib_dev, port, &sl2vl); + if (err) { + pr_err("Unable to get current sl to vl mapping for port %d. Using all zeroes (%d)\n", + port, err); + sl2vl = 0; + } + atomic64_set(&mdev->sl2vl[port - 1], sl2vl); +} + +static void ib_sl2vl_update_work(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *mdev = ew->ib_dev; + int port = ew->port; + + mlx4_ib_sl2vl_update(mdev, port); + + kfree(ew); +} + +void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, + int port) +{ + struct ib_event_work *ew; + + ew = kmalloc(sizeof(*ew), GFP_ATOMIC); + if (ew) { + INIT_WORK(&ew->work, ib_sl2vl_update_work); + ew->port = port; + ew->ib_dev = ibdev; + queue_work(wq, &ew->work); + } +} + +static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, + enum mlx4_dev_event event, unsigned long param) +{ + struct ib_event ibev; + struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); + struct mlx4_eqe *eqe = NULL; + struct ib_event_work *ew; + int p = 0; + + if (mlx4_is_bonded(dev) && + ((event == MLX4_DEV_EVENT_PORT_UP) || + (event == MLX4_DEV_EVENT_PORT_DOWN))) { + ew = kmalloc(sizeof(*ew), GFP_ATOMIC); + if (!ew) + return; + INIT_WORK(&ew->work, handle_bonded_port_state_event); + ew->ib_dev = ibdev; + queue_work(wq, &ew->work); + return; + } + + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) + eqe = (struct mlx4_eqe *)param; + else + p = (int) param; + + switch (event) { + case MLX4_DEV_EVENT_PORT_UP: + if (p > ibdev->num_ports) + return; + if (!mlx4_is_slave(dev) && + rdma_port_get_link_layer(&ibdev->ib_dev, p) == + IB_LINK_LAYER_INFINIBAND) { + if (mlx4_is_master(dev)) + mlx4_ib_invalidate_all_guid_record(ibdev, p); + if (ibdev->dev->flags & MLX4_FLAG_SECURE_HOST && + !(ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) + mlx4_sched_ib_sl2vl_update_work(ibdev, p); + } + ibev.event = IB_EVENT_PORT_ACTIVE; + break; + + case MLX4_DEV_EVENT_PORT_DOWN: + if (p > ibdev->num_ports) + return; + ibev.event = IB_EVENT_PORT_ERR; + break; + + case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx4_ib_handle_catas_error(ibdev); + break; + + case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: + ew = kmalloc(sizeof *ew, GFP_ATOMIC); + if (!ew) + return; + + INIT_WORK(&ew->work, handle_port_mgmt_change_event); + memcpy(&ew->ib_eqe, eqe, sizeof *eqe); + ew->ib_dev = ibdev; + /* need to queue only for port owner, which uses GEN_EQE */ + if (mlx4_is_master(dev)) + queue_work(wq, &ew->work); + else + handle_port_mgmt_change_event(&ew->work); + return; + + case MLX4_DEV_EVENT_SLAVE_INIT: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 1); + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 1); + } + } + return; + + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 0); + } + } + /* here, p is the slave id */ + do_slave_init(ibdev, p, 0); + return; + + default: + return; + } + + ibev.device = ibdev_ptr; + ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p; + + ib_dispatch_event(&ibev); +} + +static struct mlx4_interface mlx4_ib_interface = { + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event, + .protocol = MLX4_PROT_IB_IPV6, + .flags = MLX4_INTFF_BONDING +}; + +static int __init mlx4_ib_init(void) +{ + int err; + + wq = alloc_ordered_workqueue("mlx4_ib", WQ_MEM_RECLAIM); + if (!wq) + return -ENOMEM; + + err = mlx4_ib_mcg_init(); + if (err) + goto clean_wq; + + err = mlx4_register_interface(&mlx4_ib_interface); + if (err) + goto clean_mcg; + + return 0; + +clean_mcg: + mlx4_ib_mcg_destroy(); + +clean_wq: + destroy_workqueue(wq); + return err; +} + +static void __exit mlx4_ib_cleanup(void) +{ + mlx4_unregister_interface(&mlx4_ib_interface); + mlx4_ib_mcg_destroy(); + destroy_workqueue(wq); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c new file mode 100644 index 000000000..81ffc007e --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_sa.h> + +#include <linux/mlx4/cmd.h> +#include <linux/rbtree.h> +#include <linux/delay.h> + +#include "mlx4_ib.h" + +#define MAX_VFS 80 +#define MAX_PEND_REQS_PER_FUNC 4 +#define MAD_TIMEOUT_MS 2000 + +#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) +#define mcg_error(fmt, arg...) pr_err(fmt, ##arg) +#define mcg_warn_group(group, format, arg...) \ + pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, group->demux->port, ## arg) + +#define mcg_debug_group(group, format, arg...) \ + pr_debug("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, (group)->demux->port, ## arg) + +#define mcg_error_group(group, format, arg...) \ + pr_err(" %16s: " format, (group)->name, ## arg) + + +static union ib_gid mgid0; + +static struct workqueue_struct *clean_wq; + +enum mcast_state { + MCAST_NOT_MEMBER = 0, + MCAST_MEMBER, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_JOIN_SENT, + MCAST_LEAVE_SENT, + MCAST_RESP_READY +}; + +struct mcast_member { + enum mcast_state state; + uint8_t join_state; + int num_pend_reqs; + struct list_head pending; +}; + +struct ib_sa_mcmember_data { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtusel_mtu; + u8 tclass; + __be16 pkey; + u8 ratesel_rate; + u8 lifetmsel_lifetm; + __be32 sl_flowlabel_hoplimit; + u8 scope_join_state; + u8 proxy_join; + u8 reserved[2]; +} __packed __aligned(4); + +struct mcast_group { + struct ib_sa_mcmember_data rec; + struct rb_node node; + struct list_head mgid0_list; + struct mlx4_ib_demux_ctx *demux; + struct mcast_member func[MAX_VFS]; + struct mutex lock; + struct work_struct work; + struct list_head pending_list; + int members[3]; + enum mcast_group_state state; + enum mcast_group_state prev_state; + struct ib_sa_mad response_sa_mad; + __be64 last_req_tid; + + char name[33]; /* MGID string */ + struct device_attribute dentry; + + /* refcount is the reference count for the following: + 1. Each queued request + 2. Each invocation of the worker thread + 3. Membership of the port at the SA + */ + atomic_t refcount; + + /* delayed work to clean pending SM request */ + struct delayed_work timeout_work; + struct list_head cleanup_list; +}; + +struct mcast_req { + int func; + struct ib_sa_mad sa_mad; + struct list_head group_list; + struct list_head func_list; + struct mcast_group *group; + int clean; +}; + + +#define safe_atomic_dec(ref) \ + do {\ + if (atomic_dec_and_test(ref)) \ + mcg_warn_group(group, "did not expect to reach zero\n"); \ + } while (0) + +static const char *get_state_string(enum mcast_group_state state) +{ + switch (state) { + case MCAST_IDLE: + return "MCAST_IDLE"; + case MCAST_JOIN_SENT: + return "MCAST_JOIN_SENT"; + case MCAST_LEAVE_SENT: + return "MCAST_LEAVE_SENT"; + case MCAST_RESP_READY: + return "MCAST_RESP_READY"; + } + return "Invalid State"; +} + +static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid) +{ + struct rb_node *node = ctx->mcg_table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, + struct mcast_group *group) +{ + struct rb_node **link = &ctx->mcg_table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &ctx->mcg_table); + return NULL; +} + +static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct rdma_ah_attr ah_attr; + unsigned long flags; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (!dev->sm_ah[ctx->port - 1]) { + /* port is not yet Active, sm_ah not ready */ + spin_unlock_irqrestore(&dev->sm_lock, flags); + return -EAGAIN; + } + mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + spin_unlock_irqrestore(&dev->sm_lock, flags); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, 0xffff, mad); +} + +static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; + struct ib_wc wc; + struct rdma_ah_attr ah_attr; + + /* Our agent might not yet be registered when mads start to arrive */ + if (!agent) + return -EAGAIN; + + rdma_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = ctx->port; + wc.slid = rdma_ah_get_dlid(&ah_attr); /* opensm lid */ + wc.src_qp = 1; + return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); +} + +static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + /* we rely on a mad request as arrived from a VF */ + memcpy(&mad, sa_mad, sizeof mad); + + /* fix port GID to be the real one (slave 0) */ + sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; + + /* assign our own TID */ + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_leave_to_wire(struct mcast_group *group, u8 join_state) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_SA_METHOD_DELETE; + mad.mad_hdr.status = cpu_to_be16(0); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = 0x0; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; + + *sa_data = group->rec; + sa_data->scope_join_state = join_state; + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) + group->state = MCAST_IDLE; + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_reply_to_slave(int slave, struct mcast_group *group, + struct ib_sa_mad *req_sa_mad, u16 status) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + mad.mad_hdr.status = cpu_to_be16(status); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; + *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ + + *sa_data = group->rec; + + /* reconstruct VF's requested join_state and port_gid */ + sa_data->scope_join_state &= 0xf0; + sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); + memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); + + ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); + return ret; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 src_value, u8 dst_value) +{ + int err; + u8 selector = dst_value >> 6; + dst_value &= 0x3f; + src_value &= 0x3f; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static u16 cmp_rec(struct ib_sa_mcmember_data *src, + struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) +{ + /* src is group record, dst is request record */ + /* MGID must already match */ + /* Port_GID we always replace to our Port_GID, so it is a match */ + +#define MAD_STATUS_REQ_INVALID 0x0200 + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, + src->mtusel_mtu, dst->mtusel_mtu)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->tclass != dst->tclass) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, + src->ratesel_rate, dst->ratesel_rate)) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && + (src->scope_join_state & 0xf0) != + (dst->scope_join_state & 0xf0)) + return MAD_STATUS_REQ_INVALID; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +/* release group, return 1 if this was last release and group is destroyed + * timout work is canceled sync */ +static int release_group(struct mcast_group *group, int from_timeout_handler) +{ + struct mlx4_ib_demux_ctx *ctx = group->demux; + int nzgroup; + + mutex_lock(&ctx->mcg_table_lock); + mutex_lock(&group->lock); + if (atomic_dec_and_test(&group->refcount)) { + if (!from_timeout_handler) { + if (group->state != MCAST_IDLE && + !cancel_delayed_work(&group->timeout_work)) { + atomic_inc(&group->refcount); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return 0; + } + } + + nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); + if (nzgroup) + del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + if (!list_empty(&group->pending_list)) + mcg_warn_group(group, "releasing a group with non empty pending list\n"); + if (nzgroup) + rb_erase(&group->node, &ctx->mcg_table); + list_del_init(&group->mgid0_list); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return 1; + } else { + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + } + return 0; +} + +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (1 << i); + + return leave_state & (group->rec.scope_join_state & 0xf); +} + +static int join_group(struct mcast_group *group, int slave, u8 join_mask) +{ + int ret = 0; + u8 join_state; + + /* remove bits that slave is already member of, and adjust */ + join_state = join_mask & (~group->func[slave].join_state); + adjust_membership(group, join_state, 1); + group->func[slave].join_state |= join_state; + if (group->func[slave].state != MCAST_MEMBER && join_state) { + group->func[slave].state = MCAST_MEMBER; + ret = 1; + } + return ret; +} + +static int leave_group(struct mcast_group *group, int slave, u8 leave_state) +{ + int ret = 0; + + adjust_membership(group, leave_state, -1); + group->func[slave].join_state &= ~leave_state; + if (!group->func[slave].join_state) { + group->func[slave].state = MCAST_NOT_MEMBER; + ret = 1; + } + return ret; +} + +static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) +{ + if (group->func[slave].state != MCAST_MEMBER) + return MAD_STATUS_REQ_INVALID; + + /* make sure we're not deleting unset bits */ + if (~group->func[slave].join_state & leave_mask) + return MAD_STATUS_REQ_INVALID; + + if (!leave_mask) + return MAD_STATUS_REQ_INVALID; + + return 0; +} + +static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mcast_group *group; + struct mcast_req *req = NULL; + + group = container_of(delay, typeof(*group), timeout_work); + + mutex_lock(&group->lock); + if (group->state == MCAST_JOIN_SENT) { + if (!list_empty(&group->pending_list)) { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + list_del(&req->group_list); + list_del(&req->func_list); + --group->func[req->func].num_pend_reqs; + mutex_unlock(&group->lock); + kfree(req); + if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { + if (release_group(group, 1)) + return; + } else { + kfree(group); + return; + } + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "DRIVER BUG\n"); + } else if (group->state == MCAST_LEAVE_SENT) { + if (group->rec.scope_join_state & 0xf) + group->rec.scope_join_state &= 0xf0; + group->state = MCAST_IDLE; + mutex_unlock(&group->lock); + if (release_group(group, 1)) + return; + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); + group->state = MCAST_IDLE; + atomic_inc(&group->refcount); + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + + mutex_unlock(&group->lock); +} + +static int handle_leave_req(struct mcast_group *group, u8 leave_mask, + struct mcast_req *req) +{ + u16 status; + + if (req->clean) + leave_mask = group->func[req->func].join_state; + + status = check_leave(group, req->func, leave_mask); + if (!status) + leave_group(group, req->func, leave_mask); + + if (!req->clean) + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + return 1; +} + +static int handle_join_req(struct mcast_group *group, u8 join_mask, + struct mcast_req *req) +{ + u8 group_join_state = group->rec.scope_join_state & 0xf; + int ref = 0; + u16 status; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + + if (join_mask == (group_join_state & join_mask)) { + /* port's membership need not change */ + status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); + if (!status) + join_group(group, req->func, join_mask); + + --group->func[req->func].num_pend_reqs; + send_reply_to_slave(req->func, group, &req->sa_mad, status); + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++ref; + } else { + /* port's membership needs to be updated */ + group->prev_state = group->state; + if (send_join_to_wire(group, &req->sa_mad)) { + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ref = 1; + group->state = group->prev_state; + } else + group->state = MCAST_JOIN_SENT; + } + + return ref; +} + +static void mlx4_ib_mcg_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_req *req = NULL; + struct ib_sa_mcmember_data *sa_data; + u8 req_join_state; + int rc = 1; /* release_count - this is for the scheduled work */ + u16 status; + u8 method; + + group = container_of(work, typeof(*group), work); + + mutex_lock(&group->lock); + + /* First, let's see if a response from SM is waiting regarding this group. + * If so, we need to update the group's REC. If this is a bad response, we + * may need to send a bad response to a VF waiting for it. If VF is waiting + * and this is a good response, the VF will be answered later in this func. */ + if (group->state == MCAST_RESP_READY) { + /* cancels mlx4_ib_mcg_timeout_handler */ + cancel_delayed_work(&group->timeout_work); + status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); + method = group->response_sa_mad.mad_hdr.method; + if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { + mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n", + be64_to_cpu(group->response_sa_mad.mad_hdr.tid), + be64_to_cpu(group->last_req_tid)); + group->state = group->prev_state; + goto process_requests; + } + if (status) { + if (!list_empty(&group->pending_list)) + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (req) { + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++rc; + } else + mcg_warn_group(group, "no request for failed join\n"); + } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) + ++rc; + } else { + u8 resp_join_state; + u8 cur_join_state; + + resp_join_state = ((struct ib_sa_mcmember_data *) + group->response_sa_mad.data)->scope_join_state & 0xf; + cur_join_state = group->rec.scope_join_state & 0xf; + + if (method == IB_MGMT_METHOD_GET_RESP) { + /* successfull join */ + if (!cur_join_state && resp_join_state) + --rc; + } else if (!resp_join_state) + ++rc; + memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); + } + group->state = MCAST_IDLE; + } + +process_requests: + /* We should now go over pending join/leave requests, as long as we are idle. */ + while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + req = list_first_entry(&group->pending_list, struct mcast_req, + group_list); + sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + req_join_state = sa_data->scope_join_state & 0xf; + + /* For a leave request, we will immediately answer the VF, and + * update our internal counters. The actual leave will be sent + * to SM later, if at all needed. We dequeue the request now. */ + if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) + rc += handle_leave_req(group, req_join_state, req); + else + rc += handle_join_req(group, req_join_state, req); + } + + /* Handle leaves */ + if (group->state == MCAST_IDLE) { + req_join_state = get_leave_state(group); + if (req_join_state) { + group->rec.scope_join_state &= ~req_join_state; + group->prev_state = group->state; + if (send_leave_to_wire(group, req_join_state)) { + group->state = group->prev_state; + ++rc; + } else + group->state = MCAST_LEAVE_SENT; + } + } + + if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) + goto process_requests; + mutex_unlock(&group->lock); + + while (rc--) + release_group(group, 0); +} + +static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, + __be64 tid, + union ib_gid *new_mgid) +{ + struct mcast_group *group = NULL, *cur_group, *n; + struct mcast_req *req; + + mutex_lock(&ctx->mcg_table_lock); + list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) { + mutex_lock(&group->lock); + if (group->last_req_tid == tid) { + if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { + group->rec.mgid = *new_mgid; + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + list_del_init(&group->mgid0_list); + cur_group = mcast_insert(ctx, group); + if (cur_group) { + /* A race between our code and SM. Silently cleaning the new one */ + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + release_group(group, 0); + return NULL; + } + + atomic_inc(&group->refcount); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return group; + } else { + struct mcast_req *tmp1, *tmp2; + + list_del(&group->mgid0_list); + if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) + cancel_delayed_work_sync(&group->timeout_work); + + list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { + list_del(&tmp1->group_list); + kfree(tmp1); + } + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return NULL; + } + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); + + return NULL; +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf); + +static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid, int create) +{ + struct mcast_group *group, *cur_group; + int is_mgid0; + int i; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + group = mcast_find(ctx, mgid); + if (group) + goto found; + } + + if (!create) + return ERR_PTR(-ENOENT); + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + group->demux = ctx; + group->rec.mgid = *mgid; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->mgid0_list); + for (i = 0; i < MAX_VFS; ++i) + INIT_LIST_HEAD(&group->func[i].pending); + INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); + INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); + mutex_init(&group->lock); + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + sysfs_attr_init(&group->dentry.attr); + group->dentry.show = sysfs_show_group; + group->dentry.store = NULL; + group->dentry.attr.name = group->name; + group->dentry.attr.mode = 0400; + group->state = MCAST_IDLE; + + if (is_mgid0) { + list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); + goto found; + } + + cur_group = mcast_insert(ctx, group); + if (cur_group) { + mcg_warn("group just showed up %s - confused\n", cur_group->name); + kfree(group); + return ERR_PTR(-EINVAL); + } + + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + +found: + atomic_inc(&group->refcount); + return group; +} + +static void queue_req(struct mcast_req *req) +{ + struct mcast_group *group = req->group; + + atomic_inc(&group->refcount); /* for the request */ + atomic_inc(&group->refcount); /* for scheduling the work */ + list_add_tail(&req->group_list, &group->pending_list); + list_add_tail(&req->func_list, &group->func[req->func].pending); + /* calls mlx4_ib_mcg_work_handler */ + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); +} + +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + + switch (mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_DELETE_RESP: + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, 0); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { + __be64 tid = mad->mad_hdr.tid; + *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ + group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); + } else + group = NULL; + } + + if (!group) + return 1; + + mutex_lock(&group->lock); + group->response_sa_mad = *mad; + group->prev_state = group->state; + group->state = MCAST_RESP_READY; + /* calls mlx4_ib_mcg_work_handler */ + atomic_inc(&group->refcount); + if (!queue_work(ctx->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_MGMT_METHOD_SET: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE: + return 0; /* not consumed, pass-through to guest over tunnel */ + default: + mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", + port, mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + struct mcast_req *req; + int may_create = 0; + + if (ctx->flushing) + return -EAGAIN; + + switch (sa_mad->mad_hdr.method) { + case IB_MGMT_METHOD_SET: + may_create = 1; + /* fall through */ + case IB_SA_METHOD_DELETE: + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->func = slave; + req->sa_mad = *sa_mad; + + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, may_create); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + kfree(req); + return PTR_ERR(group); + } + mutex_lock(&group->lock); + if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { + mutex_unlock(&group->lock); + mcg_debug_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", + port, slave, MAX_PEND_REQS_PER_FUNC); + release_group(group, 0); + kfree(req); + return -ENOMEM; + } + ++group->func[slave].num_pend_reqs; + req->group = group; + queue_req(req); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_SA_METHOD_GET_TABLE: + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE_RESP: + return 0; /* not consumed, pass-through */ + default: + mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", + port, slave, sa_mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mcast_group *group = + container_of(attr, struct mcast_group, dentry); + struct mcast_req *req = NULL; + char pending_str[40]; + char state_str[40]; + ssize_t len = 0; + int f; + + if (group->state == MCAST_IDLE) + sprintf(state_str, "%s", get_state_string(group->state)); + else + sprintf(state_str, "%s(TID=0x%llx)", + get_state_string(group->state), + be64_to_cpu(group->last_req_tid)); + if (list_empty(&group->pending_list)) { + sprintf(pending_str, "No"); + } else { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + sprintf(pending_str, "Yes(TID=0x%llx)", + be64_to_cpu(req->sa_mad.mad_hdr.tid)); + } + len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", + group->rec.scope_join_state & 0xf, + group->members[2], group->members[1], group->members[0], + atomic_read(&group->refcount), + pending_str, + state_str); + for (f = 0; f < MAX_VFS; ++f) + if (group->func[f].state == MCAST_MEMBER) + len += sprintf(buf + len, "%d[%1x] ", + f, group->func[f].join_state); + + len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " + "%4x %4x %2x %2x)\n", + be16_to_cpu(group->rec.pkey), + be32_to_cpu(group->rec.qkey), + (group->rec.mtusel_mtu & 0xc0) >> 6, + group->rec.mtusel_mtu & 0x3f, + group->rec.tclass, + (group->rec.ratesel_rate & 0xc0) >> 6, + group->rec.ratesel_rate & 0x3f, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, + be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, + group->rec.proxy_join); + + return len; +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) +{ + char name[20]; + + atomic_set(&ctx->tid, 0); + sprintf(name, "mlx4_ib_mcg%d", ctx->port); + ctx->mcg_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!ctx->mcg_wq) + return -ENOMEM; + + mutex_init(&ctx->mcg_table_lock); + ctx->mcg_table = RB_ROOT; + INIT_LIST_HEAD(&ctx->mcg_mgid0_list); + ctx->flushing = 0; + + return 0; +} + +static void force_clean_group(struct mcast_group *group) +{ + struct mcast_req *req, *tmp + ; + list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { + list_del(&req->group_list); + kfree(req); + } + del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); + rb_erase(&group->node, &group->demux->mcg_table); + kfree(group); +} + +static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + int i; + struct rb_node *p; + struct mcast_group *group; + unsigned long end; + int count; + + for (i = 0; i < MAX_VFS; ++i) + clean_vf_mcast(ctx, i); + + end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + do { + count = 0; + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) + ++count; + mutex_unlock(&ctx->mcg_table_lock); + if (!count) + break; + + usleep_range(1000, 2000); + } while (time_after(end, jiffies)); + + flush_workqueue(ctx->mcg_wq); + if (destroy_wq) + destroy_workqueue(ctx->mcg_wq); + + mutex_lock(&ctx->mcg_table_lock); + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) + mcg_debug_group(group, "group refcount %d!!! (pointer %p)\n", + atomic_read(&group->refcount), group); + + force_clean_group(group); + } + mutex_unlock(&ctx->mcg_table_lock); +} + +struct clean_work { + struct work_struct work; + struct mlx4_ib_demux_ctx *ctx; + int destroy_wq; +}; + +static void mcg_clean_task(struct work_struct *work) +{ + struct clean_work *cw = container_of(work, struct clean_work, work); + + _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; + kfree(cw); +} + +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + struct clean_work *work; + + if (ctx->flushing) + return; + + ctx->flushing = 1; + + if (destroy_wq) { + _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; + return; + } + + work = kmalloc(sizeof *work, GFP_KERNEL); + if (!work) { + ctx->flushing = 0; + return; + } + + work->ctx = ctx; + work->destroy_wq = destroy_wq; + INIT_WORK(&work->work, mcg_clean_task); + queue_work(clean_wq, &work->work); +} + +static void build_leave_mad(struct mcast_req *req) +{ + struct ib_sa_mad *mad = &req->sa_mad; + + mad->mad_hdr.method = IB_SA_METHOD_DELETE; +} + + +static void clear_pending_reqs(struct mcast_group *group, int vf) +{ + struct mcast_req *req, *tmp, *group_first = NULL; + int clear; + int pend = 0; + + if (!list_empty(&group->pending_list)) + group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); + + list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { + clear = 1; + if (group_first == req && + (group->state == MCAST_JOIN_SENT || + group->state == MCAST_LEAVE_SENT)) { + clear = cancel_delayed_work(&group->timeout_work); + pend = !clear; + group->state = MCAST_IDLE; + } + if (clear) { + --group->func[vf].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + atomic_dec(&group->refcount); + } + } + + if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { + mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", + list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); + } +} + +static int push_deleteing_req(struct mcast_group *group, int slave) +{ + struct mcast_req *req; + struct mcast_req *pend_req; + + if (!group->func[slave].join_state) + return 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + if (!list_empty(&group->func[slave].pending)) { + pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); + if (pend_req->clean) { + kfree(req); + return 0; + } + } + + req->clean = 1; + req->func = slave; + req->group = group; + ++group->func[slave].num_pend_reqs; + build_leave_mad(req); + queue_req(req); + return 0; +} + +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) +{ + struct mcast_group *group; + struct rb_node *p; + + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { + group = rb_entry(p, struct mcast_group, node); + mutex_lock(&group->lock); + if (atomic_read(&group->refcount)) { + /* clear pending requests of this VF */ + clear_pending_reqs(group, slave); + push_deleteing_req(group, slave); + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); +} + + +int mlx4_ib_mcg_init(void) +{ + clean_wq = alloc_ordered_workqueue("mlx4_ib_mcg", WQ_MEM_RECLAIM); + if (!clean_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_mcg_destroy(void) +{ + destroy_workqueue(clean_wq); +} diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h new file mode 100644 index 000000000..76ca67aa4 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -0,0 +1,932 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_H +#define MLX4_IB_H + +#include <linux/compiler.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/idr.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_sa.h> + +#include <linux/mlx4/device.h> +#include <linux/mlx4/doorbell.h> +#include <linux/mlx4/qp.h> +#include <linux/mlx4/cq.h> + +#define MLX4_IB_DRV_NAME "mlx4_ib" + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ + +#define mlx4_ib_warn(ibdev, format, arg...) \ + dev_warn((ibdev)->dev.parent, MLX4_IB_DRV_NAME ": " format, ## arg) + +enum { + MLX4_IB_SQ_MIN_WQE_SHIFT = 6, + MLX4_IB_MAX_HEADROOM = 2048 +}; + +#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + +/*module param to indicate if SM assigns the alias_GUID*/ +extern int mlx4_ib_sm_guid_assign; + +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS 256 + +enum hw_bar_type { + HW_BAR_BF, + HW_BAR_DB, + HW_BAR_CLOCK, + HW_BAR_COUNT +}; + +struct mlx4_ib_vma_private_data { + struct vm_area_struct *vma; +}; + +struct mlx4_ib_ucontext { + struct ib_ucontext ibucontext; + struct mlx4_uar uar; + struct list_head db_page_list; + struct mutex db_page_mutex; + struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; + struct list_head wqn_ranges_list; + struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */ +}; + +struct mlx4_ib_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +struct mlx4_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; + struct ib_pd *pd; + struct ib_cq *cq; +}; + +struct mlx4_ib_cq_buf { + struct mlx4_buf buf; + struct mlx4_mtt mtt; + int entry_size; +}; + +struct mlx4_ib_cq_resize { + struct mlx4_ib_cq_buf buf; + int cqe; +}; + +struct mlx4_ib_cq { + struct ib_cq ibcq; + struct mlx4_cq mcq; + struct mlx4_ib_cq_buf buf; + struct mlx4_ib_cq_resize *resize_buf; + struct mlx4_db db; + spinlock_t lock; + struct mutex resize_mutex; + struct ib_umem *umem; + struct ib_umem *resize_umem; + int create_flags; + /* List of qps that it serves.*/ + struct list_head send_qp_list; + struct list_head recv_qp_list; +}; + +#define MLX4_MR_PAGES_ALIGN 0x40 + +struct mlx4_ib_mr { + struct ib_mr ibmr; + __be64 *pages; + dma_addr_t page_map; + u32 npages; + u32 max_pages; + struct mlx4_mr mmr; + struct ib_umem *umem; + size_t page_map_size; +}; + +struct mlx4_ib_mw { + struct ib_mw ibmw; + struct mlx4_mw mmw; +}; + +struct mlx4_ib_fmr { + struct ib_fmr ibfmr; + struct mlx4_fmr mfmr; +}; + +#define MAX_REGS_PER_FLOW 2 + +struct mlx4_flow_reg_id { + u64 id; + u64 mirror; +}; + +struct mlx4_ib_flow { + struct ib_flow ibflow; + /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ + struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW]; +}; + +struct mlx4_ib_wq { + u64 *wrid; + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; +}; + +enum { + MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START +}; + +enum mlx4_ib_qp_flags { + MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_SCATTER_FCS = IB_QP_CREATE_SCATTER_FCS, + + /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ + MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, + MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, + MLX4_IB_SRIOV_SQP = 1 << 31, +}; + +struct mlx4_ib_gid_entry { + struct list_head list; + union ib_gid gid; + int added; + u8 port; +}; + +enum mlx4_ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. + */ + MLX4_IB_QPT_SMI = IB_QPT_SMI, + MLX4_IB_QPT_GSI = IB_QPT_GSI, + + MLX4_IB_QPT_RC = IB_QPT_RC, + MLX4_IB_QPT_UC = IB_QPT_UC, + MLX4_IB_QPT_UD = IB_QPT_UD, + MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, + MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, + MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, + MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, + MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, + + MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, + MLX4_IB_QPT_PROXY_SMI = 1 << 17, + MLX4_IB_QPT_PROXY_GSI = 1 << 18, + MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, + MLX4_IB_QPT_TUN_SMI = 1 << 20, + MLX4_IB_QPT_TUN_GSI = 1 << 21, +}; + +#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ + MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) + +enum mlx4_ib_mad_ifc_flags { + MLX4_MAD_IFC_IGNORE_MKEY = 1, + MLX4_MAD_IFC_IGNORE_BKEY = 2, + MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | + MLX4_MAD_IFC_IGNORE_BKEY), + MLX4_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX4_NUM_TUNNEL_BUFS = 256, +}; + +struct mlx4_ib_tunnel_header { + struct mlx4_av av; + __be32 remote_qpn; + __be32 qkey; + __be16 vlan; + u8 mac[6]; + __be16 pkey_index; + u8 reserved[6]; +}; + +struct mlx4_ib_buf { + void *addr; + dma_addr_t map; +}; + +struct mlx4_rcv_tunnel_hdr { + __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: + * 0x0 - no vlan was in the packet + * 0x01 - C-VLAN was in the packet */ + u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ + u8 reserved; + __be16 pkey_index; + __be16 sl_vid; + __be16 slid_mac_47_32; + __be32 mac_31_0; +}; + +struct mlx4_ib_proxy_sqp_hdr { + struct ib_grh grh; + struct mlx4_rcv_tunnel_hdr tun; +} __packed; + +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + +struct mlx4_wqn_range { + int base_wqn; + int size; + int refcount; + bool dirty; + struct list_head list; +}; + +struct mlx4_ib_rss { + unsigned int base_qpn_tbl_sz; + u8 flags; + u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; +}; + +struct mlx4_ib_qp { + union { + struct ib_qp ibqp; + struct ib_wq ibwq; + }; + struct mlx4_qp mqp; + struct mlx4_buf buf; + + struct mlx4_db db; + struct mlx4_ib_wq rq; + + u32 doorbell_qpn; + __be32 sq_signal_bits; + unsigned sq_next_wqe; + int sq_spare_wqes; + struct mlx4_ib_wq sq; + + enum mlx4_ib_qp_type mlx4_ib_qp_type; + struct ib_umem *umem; + struct mlx4_mtt mtt; + int buf_size; + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 sq_no_prefetch; + u8 state; + int mlx_type; + u32 inl_recv_sz; + struct list_head gid_list; + struct list_head steering_rules; + struct mlx4_ib_buf *sqp_proxy_rcv; + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + u64 reg_id; + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; + struct counter_index *counter_index; + struct mlx4_wqn_range *wqn_range; + /* Number of RSS QP parents that uses this WQ */ + u32 rss_usecnt; + struct mlx4_ib_rss *rss_ctx; +}; + +struct mlx4_ib_srq { + struct ib_srq ibsrq; + struct mlx4_srq msrq; + struct mlx4_buf buf; + struct mlx4_db db; + u64 *wrid; + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + struct mlx4_mtt mtt; + struct mutex mutex; +}; + +struct mlx4_ib_ah { + struct ib_ah ibah; + union mlx4_ext_av av; +}; + +/****************************************/ +/* alias guid support */ +/****************************************/ +#define NUM_PORT_ALIAS_GUID 2 +#define NUM_ALIAS_GUID_IN_REC 8 +#define NUM_ALIAS_GUID_REC_IN_PORT 16 +#define GUID_REC_SIZE 8 +#define NUM_ALIAS_GUID_PER_PORT 128 +#define MLX4_NOT_SET_GUID (0x00LL) +#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) + +enum mlx4_guid_alias_rec_status { + MLX4_GUID_INFO_STATUS_IDLE, + MLX4_GUID_INFO_STATUS_SET, +}; + +#define GUID_STATE_NEED_PORT_INIT 0x01 + +enum mlx4_guid_alias_rec_method { + MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, + MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, +}; + +struct mlx4_sriov_alias_guid_info_rec_det { + u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; + ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ + enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ + unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC]; + u64 time_to_run; +}; + +struct mlx4_sriov_alias_guid_port_rec_det { + struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; + struct workqueue_struct *wq; + struct delayed_work alias_guid_work; + u8 port; + u32 state_flags; + struct mlx4_sriov_alias_guid *parent; + struct list_head cb_list; +}; + +struct mlx4_sriov_alias_guid { + struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; + spinlock_t ag_work_lock; + struct ib_sa_client *sa_client; +}; + +struct mlx4_ib_demux_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + int slave; + int do_init; + u8 port; + +}; + +struct mlx4_ib_tun_tx_buf { + struct mlx4_ib_buf buf; + struct ib_ah *ah; +}; + +struct mlx4_ib_demux_pv_qp { + struct ib_qp *qp; + enum ib_qp_type proxy_qpt; + struct mlx4_ib_buf *ring; + struct mlx4_ib_tun_tx_buf *tx_ring; + spinlock_t tx_lock; + unsigned tx_ix_head; + unsigned tx_ix_tail; +}; + +enum mlx4_ib_demux_pv_state { + DEMUX_PV_STATE_DOWN, + DEMUX_PV_STATE_STARTING, + DEMUX_PV_STATE_ACTIVE, + DEMUX_PV_STATE_DOWNING, +}; + +struct mlx4_ib_demux_pv_ctx { + int port; + int slave; + enum mlx4_ib_demux_pv_state state; + int has_smi; + struct ib_device *ib_dev; + struct ib_cq *cq; + struct ib_pd *pd; + struct work_struct work; + struct workqueue_struct *wq; + struct workqueue_struct *wi_wq; + struct mlx4_ib_demux_pv_qp qp[2]; +}; + +struct mlx4_ib_demux_ctx { + struct ib_device *ib_dev; + int port; + struct workqueue_struct *wq; + struct workqueue_struct *wi_wq; + struct workqueue_struct *ud_wq; + spinlock_t ud_lock; + atomic64_t subnet_prefix; + __be64 guid_cache[128]; + struct mlx4_ib_dev *dev; + /* the following lock protects both mcg_table and mcg_mgid0_list */ + struct mutex mcg_table_lock; + struct rb_root mcg_table; + struct list_head mcg_mgid0_list; + struct workqueue_struct *mcg_wq; + struct mlx4_ib_demux_pv_ctx **tun; + atomic_t tid; + int flushing; /* flushing the work queue */ +}; + +struct mlx4_ib_sriov { + struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; + struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; + /* when using this spinlock you should use "irq" because + * it may be called from interrupt context.*/ + spinlock_t going_down_lock; + int is_going_down; + + struct mlx4_sriov_alias_guid alias_guid; + + /* CM paravirtualization fields */ + struct list_head cm_list; + spinlock_t id_map_lock; + struct rb_root sl_id_map; + struct idr pv_id_table; +}; + +struct gid_cache_context { + int real_index; + int refcount; +}; + +struct gid_entry { + union ib_gid gid; + enum ib_gid_type gid_type; + struct gid_cache_context *ctx; +}; + +struct mlx4_port_gid_table { + struct gid_entry gids[MLX4_MAX_PORT_GIDS]; +}; + +struct mlx4_ib_iboe { + spinlock_t lock; + struct net_device *netdevs[MLX4_MAX_PORTS]; + atomic64_t mac[MLX4_MAX_PORTS]; + struct notifier_block nb; + struct mlx4_port_gid_table gids[MLX4_MAX_PORTS]; +}; + +struct pkey_mgt { + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct list_head pkey_port_list[MLX4_MFUNC_MAX]; + struct kobject *device_parent[MLX4_MFUNC_MAX]; +}; + +struct mlx4_ib_iov_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + u32 entry_num; + char name[15]; + struct device_attribute dentry; + struct device *dev; +}; + +struct mlx4_ib_iov_sysfs_attr_ar { + struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; +}; + +struct mlx4_ib_iov_port { + char name[100]; + u8 num; + struct mlx4_ib_dev *dev; + struct list_head list; + struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; + struct ib_port_attr attr; + struct kobject *cur_port; + struct kobject *admin_alias_parent; + struct kobject *gids_parent; + struct kobject *pkeys_parent; + struct kobject *mcgs_parent; + struct mlx4_ib_iov_sysfs_attr mcg_dentry; +}; + +struct counter_index { + struct list_head list; + u32 index; + u8 allocated; +}; + +struct mlx4_ib_counters { + struct list_head counters_list; + struct mutex mutex; /* mutex for accessing counters list */ + u32 default_counter; +}; + +#define MLX4_DIAG_COUNTERS_TYPES 2 + +struct mlx4_ib_diag_counters { + const char **name; + u32 *offset; + u32 num_counters; +}; + +struct mlx4_ib_dev { + struct ib_device ib_dev; + struct mlx4_dev *dev; + int num_ports; + void __iomem *uar_map; + + struct mlx4_uar priv_uar; + u32 priv_pdn; + MLX4_DECLARE_DOORBELL_LOCK(uar_lock); + + struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; + struct ib_ah *sm_ah[MLX4_MAX_PORTS]; + spinlock_t sm_lock; + atomic64_t sl2vl[MLX4_MAX_PORTS]; + struct mlx4_ib_sriov sriov; + + struct mutex cap_mask_mutex; + bool ib_active; + struct mlx4_ib_iboe iboe; + struct mlx4_ib_counters counters_table[MLX4_MAX_PORTS]; + int *eq_table; + struct kobject *iov_parent; + struct kobject *ports_parent; + struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; + struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; + struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; + int steering_support; + struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; + /* lock when destroying qp1_proxy and getting netdev events */ + struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; + u8 bond_next_port; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES]; +}; + +struct ib_event_work { + struct work_struct work; + struct mlx4_ib_dev *ib_dev; + struct mlx4_eqe ib_eqe; + int port; +}; + +struct mlx4_ib_qp_tunnel_init_attr { + struct ib_qp_init_attr init_attr; + int slave; + enum ib_qp_type proxy_qp_type; + u8 port; +}; + +struct mlx4_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx4_ib_dev, ib_dev); +} + +static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); +} + +static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx4_ib_pd, ibpd); +} + +static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); +} + +static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx4_ib_cq, ibcq); +} + +static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) +{ + return container_of(mcq, struct mlx4_ib_cq, mcq); +} + +static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx4_ib_mr, ibmr); +} + +static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx4_ib_mw, ibmw); +} + +static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); +} + +static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) +{ + return container_of(ibflow, struct mlx4_ib_flow, ibflow); +} + +static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx4_ib_qp, ibqp); +} + +static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_qp, mqp); +} + +static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); +} + +static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) +{ + return container_of(msrq, struct mlx4_ib_srq, msrq); +} + +static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx4_ib_ah, ibah); +} + +static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev) +{ + dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports; + + return dev->bond_next_port + 1; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db); +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem); +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx4_ib_dealloc_mw(struct ib_mw *mw); +struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx4_ib_destroy_cq(struct ib_cq *cq); +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + int slave_sgid_index, u8 *s_mac, + u16 vlan_tag); +int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int mlx4_ib_destroy_ah(struct ib_ah *ah); + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mlx4_ib_destroy_srq(struct ib_srq *srq); +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_qp(struct ib_qp *qp); +void mlx4_ib_drain_sq(struct ib_qp *qp); +void mlx4_ib_drain_rq(struct ib_qp *qp); +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad); +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, + u64 iova); +int mlx4_ib_unmap_fmr(struct list_head *fmr_list); +int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view); +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view); + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view); + +static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) +{ + u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; + + if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) + return true; + + return !!(ah->av.ib.g_slid & 0x80); +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); +int mlx4_ib_mcg_init(void); +void mlx4_ib_mcg_destroy(void); + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad); +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad); + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid); + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type); + +void mlx4_ib_tunnels_update_work(struct work_struct *work); + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad); + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct rdma_ah_attr *attr, u8 *s_mac, + u16 vlan_id, struct ib_mad *mad); + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad); + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad); + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); + +/* alias guid support */ +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); + +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, + u8 port_num, u8 *p_data); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data); + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init); + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); + +__be64 mlx4_ib_gen_node_guid(void); + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata); +int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, + const struct ib_gid_attr *attr); + +void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, + int port); + +void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); + +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_wq(struct ib_wq *wq); +int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, + int *num_of_mtts); + +#endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c new file mode 100644 index 000000000..c7c85c22e --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -0,0 +1,826 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/slab.h> +#include <rdma/ib_user_verbs.h> + +#include "mlx4_ib.h" + +static u32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) | + MLX4_PERM_LOCAL_READ; +} + +static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type) +{ + switch (type) { + case IB_MW_TYPE_1: return MLX4_MW_TYPE_1; + case IB_MW_TYPE_2: return MLX4_MW_TYPE_2; + default: return -1; + } +} + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx4_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0, + ~0ull, convert_access(acc), 0, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +enum { + MLX4_MAX_MTT_SHIFT = 31 +}; + +static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, + struct mlx4_mtt *mtt, + u64 mtt_size, u64 mtt_shift, u64 len, + u64 cur_start_addr, u64 *pages, + int *start_index, int *npages) +{ + u64 cur_end_addr = cur_start_addr + len; + u64 cur_end_addr_aligned = 0; + u64 mtt_entries; + int err = 0; + int k; + + len += (cur_start_addr & (mtt_size - 1ULL)); + cur_end_addr_aligned = round_up(cur_end_addr, mtt_size); + len += (cur_end_addr_aligned - cur_end_addr); + if (len & (mtt_size - 1ULL)) { + pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n", + len, mtt_size); + return -EINVAL; + } + + mtt_entries = (len >> mtt_shift); + + /* + * Align the MTT start address to the mtt_size. + * Required to handle cases when the MR starts in the middle of an MTT + * record. Was not required in old code since the physical addresses + * provided by the dma subsystem were page aligned, which was also the + * MTT size. + */ + cur_start_addr = round_down(cur_start_addr, mtt_size); + /* A new block is started ... */ + for (k = 0; k < mtt_entries; ++k) { + pages[*npages] = cur_start_addr + (mtt_size * k); + (*npages)++; + /* + * Be friendly to mlx4_write_mtt() and pass it chunks of + * appropriate size. + */ + if (*npages == PAGE_SIZE / sizeof(u64)) { + err = mlx4_write_mtt(dev->dev, mtt, *start_index, + *npages, pages); + if (err) + return err; + + (*start_index) += *npages; + *npages = 0; + } + } + + return 0; +} + +static inline u64 alignment_of(u64 ptr) +{ + return ilog2(ptr & (~(ptr - 1))); +} + +static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start, + u64 current_block_end, + u64 block_shift) +{ + /* Check whether the alignment of the new block is aligned as well as + * the previous block. + * Block address must start with zeros till size of entity_size. + */ + if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0) + /* + * It is not as well aligned as the previous block-reduce the + * mtt size accordingly. Here we take the last right bit which + * is 1. + */ + block_shift = alignment_of(next_block_start); + + /* + * Check whether the alignment of the end of previous block - is it + * aligned as well as the start of the block + */ + if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0) + /* + * It is not as well aligned as the start of the block - + * reduce the mtt size accordingly. + */ + block_shift = alignment_of(current_block_end); + + return block_shift; +} + +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem) +{ + u64 *pages; + u64 len = 0; + int err = 0; + u64 mtt_size; + u64 cur_start_addr = 0; + u64 mtt_shift; + int start_index = 0; + int npages = 0; + struct scatterlist *sg; + int i; + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) + return -ENOMEM; + + mtt_shift = mtt->page_shift; + mtt_size = 1ULL << mtt_shift; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { + if (cur_start_addr + len == sg_dma_address(sg)) { + /* still the same block */ + len += sg_dma_len(sg); + continue; + } + /* + * A new block is started ... + * If len is malaligned, write an extra mtt entry to cover the + * misaligned area (round up the division) + */ + err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, + mtt_shift, len, + cur_start_addr, + pages, &start_index, + &npages); + if (err) + goto out; + + cur_start_addr = sg_dma_address(sg); + len = sg_dma_len(sg); + } + + /* Handle the last block */ + if (len > 0) { + /* + * If len is malaligned, write an extra mtt entry to cover + * the misaligned area (round up the division) + */ + err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, + mtt_shift, len, + cur_start_addr, pages, + &start_index, &npages); + if (err) + goto out; + } + + if (npages) + err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages); + +out: + free_page((unsigned long) pages); + return err; +} + +/* + * Calculate optimal mtt size based on contiguous pages. + * Function will return also the number of pages that are not aligned to the + * calculated mtt_size to be added to total number of pages. For that we should + * check the first chunk length & last chunk length and if not aligned to + * mtt_size we should increment the non_aligned_pages number. All chunks in the + * middle already handled as part of mtt shift calculation for both their start + * & end addresses. + */ +int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, + int *num_of_mtts) +{ + u64 block_shift = MLX4_MAX_MTT_SHIFT; + u64 min_shift = umem->page_shift; + u64 last_block_aligned_end = 0; + u64 current_block_start = 0; + u64 first_block_start = 0; + u64 current_block_len = 0; + u64 last_block_end = 0; + struct scatterlist *sg; + u64 current_block_end; + u64 misalignment_bits; + u64 next_block_start; + u64 total_len = 0; + int i; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { + /* + * Initialization - save the first chunk start as the + * current_block_start - block means contiguous pages. + */ + if (current_block_len == 0 && current_block_start == 0) { + current_block_start = sg_dma_address(sg); + first_block_start = current_block_start; + /* + * Find the bits that are different between the physical + * address and the virtual address for the start of the + * MR. + * umem_get aligned the start_va to a page boundary. + * Therefore, we need to align the start va to the same + * boundary. + * misalignment_bits is needed to handle the case of a + * single memory region. In this case, the rest of the + * logic will not reduce the block size. If we use a + * block size which is bigger than the alignment of the + * misalignment bits, we might use the virtual page + * number instead of the physical page number, resulting + * in access to the wrong data. + */ + misalignment_bits = + (start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL))) + ^ current_block_start; + block_shift = min(alignment_of(misalignment_bits), + block_shift); + } + + /* + * Go over the scatter entries and check if they continue the + * previous scatter entry. + */ + next_block_start = sg_dma_address(sg); + current_block_end = current_block_start + current_block_len; + /* If we have a split (non-contig.) between two blocks */ + if (current_block_end != next_block_start) { + block_shift = mlx4_ib_umem_calc_block_mtt + (next_block_start, + current_block_end, + block_shift); + + /* + * If we reached the minimum shift for 4k page we stop + * the loop. + */ + if (block_shift <= min_shift) + goto end; + + /* + * If not saved yet we are in first block - we save the + * length of first block to calculate the + * non_aligned_pages number at the end. + */ + total_len += current_block_len; + + /* Start a new block */ + current_block_start = next_block_start; + current_block_len = sg_dma_len(sg); + continue; + } + /* The scatter entry is another part of the current block, + * increase the block size. + * An entry in the scatter can be larger than 4k (page) as of + * dma mapping which merge some blocks together. + */ + current_block_len += sg_dma_len(sg); + } + + /* Account for the last block in the total len */ + total_len += current_block_len; + /* Add to the first block the misalignment that it suffers from. */ + total_len += (first_block_start & ((1ULL << block_shift) - 1ULL)); + last_block_end = current_block_start + current_block_len; + last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift); + total_len += (last_block_aligned_end - last_block_end); + + if (total_len & ((1ULL << block_shift) - 1ULL)) + pr_warn("misaligned total length detected (%llu, %llu)!", + total_len, block_shift); + + *num_of_mtts = total_len >> block_shift; +end: + if (block_shift < min_shift) { + /* + * If shift is less than the min we set a warning and return the + * min shift. + */ + pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift); + + block_shift = min_shift; + } + return block_shift; +} + +static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start, + u64 length, u64 virt_addr, + int access_flags) +{ + /* + * Force registering the memory as writable if the underlying pages + * are writable. This is so rereg can change the access permissions + * from readable to writable without having to run through ib_umem_get + * again + */ + if (!ib_access_writable(access_flags)) { + struct vm_area_struct *vma; + + down_read(¤t->mm->mmap_sem); + /* + * FIXME: Ideally this would iterate over all the vmas that + * cover the memory, but for now it requires a single vma to + * entirely cover the MR to support RO mappings. + */ + vma = find_vma(current->mm, start); + if (vma && vma->vm_end >= start + length && + vma->vm_start <= start) { + if (vma->vm_flags & VM_WRITE) + access_flags |= IB_ACCESS_LOCAL_WRITE; + } else { + access_flags |= IB_ACCESS_LOCAL_WRITE; + } + + up_read(¤t->mm->mmap_sem); + } + + return ib_umem_get(context, start, length, access_flags, 0); +} + +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int shift; + int err; + int n; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length, + virt_addr, access_flags); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err_free; + } + + n = ib_umem_page_count(mr->umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, + convert_access(access_flags), n, shift, &mr->mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); + if (err) + goto err_mr; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.length = length; + mr->ibmr.iova = virt_addr; + mr->ibmr.page_size = 1U << shift; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(mr->device); + struct mlx4_ib_mr *mmr = to_mmr(mr); + struct mlx4_mpt_entry *mpt_entry; + struct mlx4_mpt_entry **pmpt_entry = &mpt_entry; + int err; + + /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs, + * we assume that the calls can't run concurrently. Otherwise, a + * race exists. + */ + err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry); + + if (err) + return err; + + if (flags & IB_MR_REREG_PD) { + err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry, + to_mpd(pd)->pdn); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_ACCESS) { + if (ib_access_writable(mr_access_flags) && + !mmr->umem->writable) { + err = -EPERM; + goto release_mpt_entry; + } + + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, + convert_access(mr_access_flags)); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_TRANS) { + int shift; + int n; + + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + mmr->umem = + mlx4_get_umem_mr(mr->uobject->context, start, length, + virt_addr, mr_access_flags); + if (IS_ERR(mmr->umem)) { + err = PTR_ERR(mmr->umem); + /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ + mmr->umem = NULL; + goto release_mpt_entry; + } + n = ib_umem_page_count(mmr->umem); + shift = mmr->umem->page_shift; + + err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, + virt_addr, length, n, shift, + *pmpt_entry); + if (err) { + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + mmr->mmr.iova = virt_addr; + mmr->mmr.size = length; + + err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem); + if (err) { + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + } + + /* If we couldn't transfer the MR to the HCA, just remember to + * return a failure. But dereg_mr will free the resources. + */ + err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry); + if (!err && flags & IB_MR_REREG_ACCESS) + mmr->mmr.access = mr_access_flags; + +release_mpt_entry: + mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry); + + return err; +} + +static int +mlx4_alloc_priv_pages(struct ib_device *device, + struct mlx4_ib_mr *mr, + int max_pages) +{ + int ret; + + /* Ensure that size is aligned to DMA cacheline + * requirements. + * max_pages is limited to MLX4_MAX_FAST_REG_PAGES + * so page_map_size will never cross PAGE_SIZE. + */ + mr->page_map_size = roundup(max_pages * sizeof(u64), + MLX4_MR_PAGES_ALIGN); + + /* Prevent cross page boundary allocation. */ + mr->pages = (__be64 *)get_zeroed_page(GFP_KERNEL); + if (!mr->pages) + return -ENOMEM; + + mr->page_map = dma_map_single(device->dev.parent, mr->pages, + mr->page_map_size, DMA_TO_DEVICE); + + if (dma_mapping_error(device->dev.parent, mr->page_map)) { + ret = -ENOMEM; + goto err; + } + + return 0; + +err: + free_page((unsigned long)mr->pages); + return ret; +} + +static void +mlx4_free_priv_pages(struct mlx4_ib_mr *mr) +{ + if (mr->pages) { + struct ib_device *device = mr->ibmr.device; + + dma_unmap_single(device->dev.parent, mr->page_map, + mr->page_map_size, DMA_TO_DEVICE); + free_page((unsigned long)mr->pages); + mr->pages = NULL; + } +} + +int mlx4_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + int ret; + + mlx4_free_priv_pages(mr); + + ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (ret) + return ret; + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} + +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mw *mw; + int err; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, + to_mlx4_type(type), &mw->mmw); + if (err) + goto err_free; + + err = mlx4_mw_enable(dev->dev, &mw->mmw); + if (err) + goto err_mw; + + mw->ibmw.rkey = mw->mmw.key; + + return &mw->ibmw; + +err_mw: + mlx4_mw_free(dev->dev, &mw->mmw); + +err_free: + kfree(mw); + + return ERR_PTR(err); +} + +int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) +{ + struct mlx4_ib_mw *mw = to_mmw(ibmw); + + mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); + kfree(mw); + + return 0; +} + +struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int err; + + if (mr_type != IB_MR_TYPE_MEM_REG || + max_num_sg > MLX4_MAX_FAST_REG_PAGES) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0, + max_num_sg, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_alloc_priv_pages(pd->device, mr, max_num_sg); + if (err) + goto err_free_mr; + + mr->max_pages = max_num_sg; + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_free_pl; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free_pl: + mr->ibmr.device = pd->device; + mlx4_free_priv_pages(mr); +err_free_mr: + (void) mlx4_mr_free(dev->dev, &mr->mmr); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_fmr *fmr; + int err = -ENOMEM; + + fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift, &fmr->mfmr); + if (err) + goto err_free; + + err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr); + if (err) + goto err_mr; + + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; + + return &fmr->ibfmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + +err_free: + kfree(fmr); + + return ERR_PTR(err); +} + +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); + + return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, + &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); +} + +int mlx4_ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *ibfmr; + int err; + struct mlx4_dev *mdev = NULL; + + list_for_each_entry(ibfmr, fmr_list, list) { + if (mdev && to_mdev(ibfmr->device)->dev != mdev) + return -EINVAL; + mdev = to_mdev(ibfmr->device)->dev; + } + + if (!mdev) + return 0; + + list_for_each_entry(ibfmr, fmr_list, list) { + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + + mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); + } + + /* + * Make sure all MPT status updates are visible before issuing + * SYNC_TPT firmware command. + */ + wmb(); + + err = mlx4_SYNC_TPT(mdev); + if (err) + pr_warn("SYNC_TPT error %d when " + "unmapping FMRs\n", err); + + return 0; +} + +int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); + int err; + + err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); + + if (!err) + kfree(ifmr); + + return err; +} + +static int mlx4_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + + if (unlikely(mr->npages == mr->max_pages)) + return -ENOMEM; + + mr->pages[mr->npages++] = cpu_to_be64(addr | MLX4_MTT_FLAG_PRESENT); + + return 0; +} + +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + int rc; + + mr->npages = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, mr->page_map, + mr->page_map_size, DMA_TO_DEVICE); + + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page); + + ib_dma_sync_single_for_device(ibmr->device, mr->page_map, + mr->page_map_size, DMA_TO_DEVICE); + + return rc; +} diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c new file mode 100644 index 000000000..7209b8a9b --- /dev/null +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -0,0 +1,4466 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/log2.h> +#include <linux/etherdevice.h> +#include <net/ip.h> +#include <linux/slab.h> +#include <linux/netdevice.h> + +#include <rdma/ib_cache.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_mad.h> + +#include <linux/mlx4/driver.h> +#include <linux/mlx4/qp.h> + +#include "mlx4_ib.h" +#include <rdma/mlx4-abi.h> + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state); + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1 +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate + * data plus 18 bytes for an Ethernet header with VLAN/802.1Q + * tag. (LRH would only use 8 bytes, so Ethernet is the + * biggest case) + */ + MLX4_IB_UD_HEADER_SIZE = 82, + MLX4_IB_LSO_HEADER_SPARE = 128, +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; + struct ib_qp *roce_v2_gsi; +}; + +enum { + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, +}; + +enum { + MLX4_RAW_QP_MTU = 7, + MLX4_RAW_QP_MSGMAX = 31, +}; + +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), + [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), +}; + +enum mlx4_ib_source_type { + MLX4_IB_QP_SRC = 0, + MLX4_IB_RWQ_SRC = 1, +}; + +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} + +static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (!mlx4_is_master(dev->dev)) + return 0; + + return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && + qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + + 8 * MLX4_MFUNC_MAX; +} + +static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_sqp = 0; + int real_sqp = 0; + int i; + /* PPF or Native -- real SQP */ + real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); + if (real_sqp) + return 1; + /* VF or PF -- proxy SQP */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy || + qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp1_proxy) { + proxy_sqp = 1; + break; + } + } + } + if (proxy_sqp) + return 1; + + return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP); +} + +/* used for INIT/CLOSE port logic */ +static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_qp0 = 0; + int real_qp0 = 0; + int i; + /* PPF or Native -- real QP0 */ + real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); + if (real_qp0) + return 1; + /* VF or PF -- proxy QP0 */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy) { + proxy_qp0 = 1; + break; + } + } + } + return proxy_qp0; +} + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + return mlx4_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with 0xffffffff, except for + * the very first chunk of the WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + __be32 *wqe; + int i; + int s; + void *buf; + struct mlx4_wqe_ctrl_seg *ctrl; + + buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + ctrl = (struct mlx4_wqe_ctrl_seg *)buf; + s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); + } +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n", + type, qp->qpn); +} + +static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case MLX4_IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + + ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0); + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + 64; + case MLX4_IB_QPT_TUN_SMI_OWNER: + case MLX4_IB_QPT_TUN_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + + case MLX4_IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_rq, struct mlx4_ib_qp *qp, + u32 inl_recv_sz) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) + return -EINVAL; + + if (!has_rq) { + if (cap->max_recv_wr || inl_recv_sz) + return -EINVAL; + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + u32 wqe_size; + + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge || + inl_recv_sz > max_inl_recv_sz)) + return -EINVAL; + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz)); + } + + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) +{ + int s; + + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + cap->max_inline_data + send_wqe_overhead(type, qp->flags) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + return -EINVAL; + + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type, qp->flags); + + if (s > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + + qp->sq_spare_wqes); + + qp->sq.max_gs = + (min(dev->dev->caps.max_sq_desc_sz, + (1 << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / + sizeof (struct mlx4_wqe_data_seg); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = + qp->sq.wqe_cnt - qp->sq_spare_wqes; + cap->max_send_sge = min(qp->sq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + /* We don't support inline sends for kernel QPs (yet) */ + cap->max_inline_data = 0; + + return 0; +} + +static int set_user_sq_size(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, + struct mlx4_ib_create_qp *ucmd) +{ + /* Sanity check SQ size before proceeding */ + if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || + ucmd->log_sq_stride > + ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || + ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) + return -EINVAL; + + qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; + qp->sq.wqe_shift = ucmd->log_sq_stride; + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + + return 0; +} + +static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + qp->sqp_proxy_rcv = + kmalloc_array(qp->rq.wqe_cnt, sizeof(struct mlx4_ib_buf), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv) + return -ENOMEM; + for (i = 0; i < qp->rq.wqe_cnt; i++) { + qp->sqp_proxy_rcv[i].addr = + kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv[i].addr) + goto err; + qp->sqp_proxy_rcv[i].map = + ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) { + kfree(qp->sqp_proxy_rcv[i].addr); + goto err; + } + } + return 0; + +err: + while (i > 0) { + --i; + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); + qp->sqp_proxy_rcv = NULL; + return -ENOMEM; +} + +static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.spec_qps[i].qp0_proxy) + return !!dev->caps.spec_qps[i].qp0_qkey; + } + return 0; +} + +static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp) +{ + mutex_lock(&dev->counters_table[qp->port - 1].mutex); + mlx4_counter_free(dev->dev, qp->counter_index->index); + list_del(&qp->counter_index->list); + mutex_unlock(&dev->counters_table[qp->port - 1].mutex); + + kfree(qp->counter_index); + qp->counter_index = NULL; +} + +static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd) +{ + rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num | + (init_attr->rwq_ind_tbl->log_ind_tbl_size << 24); + + if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) && + (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) { + memcpy(rss_ctx->rss_key, ucmd->rx_hash_key, + MLX4_EN_RSS_KEY_SIZE); + } else { + pr_debug("RX Hash function is not supported\n"); + return (-EOPNOTSUPP); + } + + if (ucmd->rx_hash_fields_mask & ~(MLX4_IB_RX_HASH_SRC_IPV4 | + MLX4_IB_RX_HASH_DST_IPV4 | + MLX4_IB_RX_HASH_SRC_IPV6 | + MLX4_IB_RX_HASH_DST_IPV6 | + MLX4_IB_RX_HASH_SRC_PORT_TCP | + MLX4_IB_RX_HASH_DST_PORT_TCP | + MLX4_IB_RX_HASH_SRC_PORT_UDP | + MLX4_IB_RX_HASH_DST_PORT_UDP | + MLX4_IB_RX_HASH_INNER)) { + pr_debug("RX Hash fields_mask has unsupported mask (0x%llx)\n", + ucmd->rx_hash_fields_mask); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + rss_ctx->flags = MLX4_RSS_IPV4; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + rss_ctx->flags |= MLX4_RSS_IPV6; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + pr_debug("RX Hash fields_mask for UDP is not supported\n"); + return (-EOPNOTSUPP); + } + + if (rss_ctx->flags & MLX4_RSS_IPV4) + rss_ctx->flags |= MLX4_RSS_UDP_IPV4; + if (rss_ctx->flags & MLX4_RSS_IPV6) + rss_ctx->flags |= MLX4_RSS_UDP_IPV6; + if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) { + pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + if (rss_ctx->flags & MLX4_RSS_IPV4) + rss_ctx->flags |= MLX4_RSS_TCP_IPV4; + if (rss_ctx->flags & MLX4_RSS_IPV6) + rss_ctx->flags |= MLX4_RSS_TCP_IPV6; + if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) { + pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) { + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* + * Hash according to inner headers if exist, otherwise + * according to outer headers. + */ + rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY; + } else { + pr_debug("RSS Hash for inner headers isn't supported\n"); + return (-EOPNOTSUPP); + } + } + + return 0; +} + +static int create_qp_rss(struct mlx4_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd, + struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; + + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage); + if (err) + return err; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + mutex_init(&qp->mutex); + + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET; + qp->state = IB_QPS_RESET; + + /* Set dummy send resources to be compatible with HV and PRM */ + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE; + qp->mtt = (to_mqp( + (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt; + + qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL); + if (!qp->rss_ctx) { + err = -ENOMEM; + goto err_qp_alloc; + } + + err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd); + if (err) + goto err; + + return 0; + +err: + kfree(qp->rss_ctx); + +err_qp_alloc: + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + +err_qpn: + mlx4_qp_release_range(dev->dev, qpn, 1); + return err; +} + +static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + int err; + + if (!udata) { + pr_debug("RSS QP with NULL udata\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + required_cmd_sz = offsetof(typeof(ucmd), reserved1) + + sizeof(ucmd.reserved1); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + pr_debug("copy failed\n"); + return ERR_PTR(-EFAULT); + } + + if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved))) + return ERR_PTR(-EOPNOTSUPP); + + if (ucmd.comp_mask || ucmd.reserved1) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + pr_debug("RSS QP with unsupported QP type %d\n", + init_attr->qp_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + pr_debug("RSS QP doesn't support create flags\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->send_cq || init_attr->cap.max_send_wr) { + pr_debug("RSS QP with unsupported send attributes\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + + return &qp->ibqp; +} + +/* + * This function allocates a WQN from a range which is consecutive and aligned + * to its size. In case the range is full, then it creates a new range and + * allocates WQN from it. The new range will be used for following allocations. + */ +static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, int range_size, int *wqn) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + int err = 0; + + mutex_lock(&context->wqn_ranges_mutex); + + range = list_first_entry_or_null(&context->wqn_ranges_list, + struct mlx4_wqn_range, list); + + if (!range || (range->refcount == range->size) || range->dirty) { + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + err = -ENOMEM; + goto out; + } + + err = mlx4_qp_reserve_range(dev->dev, range_size, + range_size, &range->base_wqn, 0, + qp->mqp.usage); + if (err) { + kfree(range); + goto out; + } + + range->size = range_size; + list_add(&range->list, &context->wqn_ranges_list); + } else if (range_size != 1) { + /* + * Requesting a new range (>1) when last range is still open, is + * not valid. + */ + err = -EINVAL; + goto out; + } + + qp->wqn_range = range; + + *wqn = range->base_wqn + range->refcount; + + range->refcount++; + +out: + mutex_unlock(&context->wqn_ranges_mutex); + + return err; +} + +static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, bool dirty_release) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + + mutex_lock(&context->wqn_ranges_mutex); + + range = qp->wqn_range; + + range->refcount--; + if (!range->refcount) { + mlx4_qp_release_range(dev->dev, range->base_wqn, + range->size); + list_del(&range->list); + kfree(range); + } else if (dirty_release) { + /* + * A range which one of its WQNs is destroyed, won't be able to be + * reused for further WQN allocations. + * The next created WQ will allocate a new range. + */ + range->dirty = 1; + } + + mutex_unlock(&context->wqn_ranges_mutex); +} + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + enum mlx4_ib_source_type src, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, + struct mlx4_ib_qp **caller_qp) +{ + int qpn; + int err; + struct mlx4_ib_sqp *sqp = NULL; + struct mlx4_ib_qp *qp; + enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + struct mlx4_ib_cq *mcq; + unsigned long flags; + int range_size = 0; + + /* When tunneling special qps, we use a plain UD qp */ + if (sqpn) { + if (mlx4_is_mfunc(dev->dev) && + (!mlx4_is_master(dev->dev) || + !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { + if (init_attr->qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_PROXY_GSI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } + } + qpn = sqpn; + /* add extra sg entry for tunneling */ + init_attr->cap.max_recv_sge++; + } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { + struct mlx4_ib_qp_tunnel_init_attr *tnl_init = + container_of(init_attr, + struct mlx4_ib_qp_tunnel_init_attr, init_attr); + if ((tnl_init->proxy_qp_type != IB_QPT_SMI && + tnl_init->proxy_qp_type != IB_QPT_GSI) || + !mlx4_is_master(dev->dev)) + return -EINVAL; + if (tnl_init->proxy_qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_TUN_GSI; + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) + qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_TUN_SMI; + /* we are definitely in the PPF here, since we are creating + * tunnel QPs. base_tunnel_sqpn is therefore valid. */ + qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; + sqpn = qpn; + } + + if (!*caller_qp) { + if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || + (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { + sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL); + if (!sqp) + return -ENOMEM; + qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } else { + qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL); + if (!qp) + return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } + } else + qp = *caller_qp; + + qp->mlx4_ib_qp_type = qp_type; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->state = IB_QPS_RESET; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + + if (pd->uobject) { + union { + struct mlx4_ib_create_qp qp; + struct mlx4_ib_create_wq wq; + } ucmd; + size_t copy_len; + int shift; + int n; + + copy_len = (src == MLX4_IB_QP_SRC) ? + sizeof(struct mlx4_ib_create_qp) : + min(sizeof(struct mlx4_ib_create_wq), udata->inlen); + + if (ib_copy_from_udata(&ucmd, udata, copy_len)) { + err = -EFAULT; + goto err; + } + + if (src == MLX4_IB_RWQ_SRC) { + if (ucmd.wq.comp_mask || ucmd.wq.reserved[0] || + ucmd.wq.reserved[1] || ucmd.wq.reserved[2]) { + pr_debug("user command isn't supported\n"); + err = -EOPNOTSUPP; + goto err; + } + + if (ucmd.wq.log_range_size > + ilog2(dev->dev->caps.max_rss_tbl_sz)) { + pr_debug("WQN range size must be equal or smaller than %d\n", + dev->dev->caps.max_rss_tbl_sz); + err = -EOPNOTSUPP; + goto err; + } + range_size = 1 << ucmd.wq.log_range_size; + } else { + qp->inl_recv_sz = ucmd.qp.inl_recv_sz; + } + + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (!(dev->dev->caps.flags & + MLX4_DEV_CAP_FLAG_FCS_KEEP)) { + pr_debug("scatter FCS is unsupported\n"); + err = -EOPNOTSUPP; + goto err; + } + + qp->flags |= MLX4_IB_QP_SCATTER_FCS; + } + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, qp->inl_recv_sz); + if (err) + goto err; + + if (src == MLX4_IB_QP_SRC) { + qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch; + + err = set_user_sq_size(dev, qp, + (struct mlx4_ib_create_qp *) + &ucmd); + if (err) + goto err; + } else { + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + /* Allocated buffer expects to have at least that SQ + * size. + */ + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + } + + qp->umem = ib_umem_get(pd->uobject->context, + (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr : + ucmd.wq.buf_addr, qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + err = PTR_ERR(qp->umem); + goto err; + } + + n = ib_umem_page_count(qp->umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); + if (err) + goto err_mtt; + + if (qp_has_rq(init_attr)) { + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + (src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr : + ucmd.wq.db_addr, &qp->db); + if (err) + goto err_mtt; + } + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; + } else { + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, 0); + if (err) + goto err; + + qp->sq_no_prefetch = 0; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) + qp->flags |= MLX4_IB_QP_NETIF; + else { + err = -EINVAL; + goto err; + } + } + + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); + if (err) + goto err; + + if (qp_has_rq(init_attr)) { + err = mlx4_db_alloc(dev->dev, &qp->db, 0); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, + &qp->buf)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + if (err) + goto err_mtt; + + qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(u64), GFP_KERNEL); + qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt, + sizeof(u64), GFP_KERNEL); + if (!qp->sq.wrid || !qp->rq.wrid) { + err = -ENOMEM; + goto err_wrid; + } + qp->mqp.usage = MLX4_RES_USAGE_DRIVER; + } + + if (sqpn) { + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + if (alloc_proxy_bufs(pd->device, qp)) { + err = -ENOMEM; + goto err_wrid; + } + } + } else if (src == MLX4_IB_RWQ_SRC) { + err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp, + range_size, &qpn); + if (err) + goto err_wrid; + } else { + /* Raw packet QPNs may not have bits 6,7 set in their qp_num; + * otherwise, the WQE BlueFlame setup flow wrongly causes + * VLAN insertion. */ + if (init_attr->qp_type == IB_QPT_RAW_PACKET) + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, + (init_attr->cap.max_send_wr ? + MLX4_RESERVE_ETH_BF_QP : 0) | + (init_attr->cap.max_recv_wr ? + MLX4_RESERVE_A0_QP : 0), + qp->mqp.usage); + else + if (qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, + &qpn, 0, qp->mqp.usage); + if (err) + goto err_proxy; + } + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + if (init_attr->qp_type == IB_QPT_XRC_TGT) + qp->mqp.qpn |= (1 << 23); + + /* + * Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = (src == MLX4_IB_QP_SRC) ? mlx4_ib_qp_event : + mlx4_ib_wq_event; + + if (!*caller_qp) + *caller_qp = qp; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + /* Maintain device to QPs access, needed for further handling + * via reset flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling + * via reset flow + */ + mcq = to_mcq(init_attr->send_cq); + list_add_tail(&qp->cq_send_list, &mcq->send_qp_list); + mcq = to_mcq(init_attr->recv_cq); + list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list); + mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + return 0; + +err_qpn: + if (!sqpn) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext(pd->uobject->context), + qp, 0); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + } +err_proxy: + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(pd->device, qp); +err_wrid: + if (pd->uobject) { + if (qp_has_rq(init_attr)) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); + } else { + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && qp_has_rq(init_attr)) + mlx4_db_free(dev->dev, &qp->db); + +err: + if (sqp) + kfree(sqp); + else if (!*caller_qp) + kfree(qp); + return err; +} + +static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX4_QP_STATE_RST; + case IB_QPS_INIT: return MLX4_QP_STATE_INIT; + case IB_QPS_RTR: return MLX4_QP_STATE_RTR; + case IB_QPS_RTS: return MLX4_QP_STATE_RTS; + case IB_QPS_SQD: return MLX4_QP_STATE_SQD; + case IB_QPS_SQE: return MLX4_QP_STATE_SQER; + case IB_QPS_ERR: return MLX4_QP_STATE_ERR; + default: return -1; + } +} + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } +} + +static void del_gid_entries(struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + list_del(&ge->list); + kfree(ge); + } +} + +static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) + return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); + else + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, + struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); + *recv_cq = *send_cq; + break; + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = *send_cq; + break; + default: + *recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) : + to_mcq(qp->ibwq.cq); + *send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) : + *recv_cq; + break; + } +} + +static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (qp->state != IB_QPS_RESET) { + int i; + + for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size); + i++) { + struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + del_gid_entries(qp); + kfree(qp->rss_ctx); +} + +static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + enum mlx4_ib_source_type src, int is_user) +{ + struct mlx4_ib_cq *send_cq, *recv_cq; + unsigned long flags; + + if (qp->state != IB_QPS_RESET) { + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } + + get_cqs(qp, src, &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(send_cq, recv_cq); + + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + list_del(&qp->cq_send_list); + list_del(&qp->cq_recv_list); + if (!is_user) { + __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL); + if (send_cq != recv_cq) + __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + + mlx4_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + mlx4_qp_free(dev->dev, &qp->mqp); + + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext( + qp->ibwq.uobject->context), qp, 1); + else + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + } + + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + + if (is_user) { + if (qp->rq.wqe_cnt) { + struct mlx4_ib_ucontext *mcontext = !src ? + to_mucontext(qp->ibqp.uobject->context) : + to_mucontext(qp->ibwq.uobject->context); + mlx4_ib_db_unmap_user(mcontext, &qp->db); + } + ib_umem_release(qp->umem); + } else { + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + free_proxy_bufs(&dev->ib_dev, qp); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (qp->rq.wqe_cnt) + mlx4_db_free(dev->dev, &qp->db); + } + + del_gid_entries(qp); +} + +static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) +{ + /* Native or PPF */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_master(dev->dev) && + attr->create_flags & MLX4_IB_SRIOV_SQP)) { + return dev->dev->phys_caps.base_sqpn + + (attr->qp_type == IB_QPT_SMI ? 0 : 2) + + attr->port_num - 1; + } + /* PF or VF -- creating proxies */ + if (attr->qp_type == IB_QPT_SMI) + return dev->dev->caps.spec_qps[attr->port_num - 1].qp0_proxy; + else + return dev->dev->caps.spec_qps[attr->port_num - 1].qp1_proxy; +} + +static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = NULL; + int err; + int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + u16 xrcdn = 0; + + if (init_attr->rwq_ind_tbl) + return _mlx4_ib_create_qp_rss(pd, init_attr, udata); + + /* + * We only support LSO, vendor flag1, and multicast loopback blocking, + * and only for kernel UD QPs. + */ + if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | + MLX4_IB_SRIOV_TUNNEL_QP | + MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_ROCE_V2_GSI)) + return ERR_PTR(-EINVAL); + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + + if (init_attr->create_flags) { + if (udata && init_attr->create_flags & ~(sup_u_create_flags)) + return ERR_PTR(-EINVAL); + + if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_CREATE_ROCE_V2_GSI | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) && + init_attr->qp_type != IB_QPT_UD) || + (init_attr->create_flags & MLX4_IB_SRIOV_SQP && + init_attr->qp_type > IB_QPT_GSI) || + (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI && + init_attr->qp_type != IB_QPT_GSI)) + return ERR_PTR(-EINVAL); + } + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + pd = to_mxrcd(init_attr->xrcd)->pd; + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; + /* fall through */ + case IB_QPT_XRC_INI: + if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + init_attr->recv_cq = init_attr->send_cq; + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_RAW_PACKET: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + /* fall through */ + case IB_QPT_UD: + { + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, 0, &qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + qp->xrcdn = xrcdn; + + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + int sqpn; + + /* Userspace is not allowed to create special QPs: */ + if (udata) + return ERR_PTR(-EINVAL); + if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { + int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, + 1, 1, &sqpn, 0, + MLX4_RES_USAGE_DRIVER); + + if (res) + return ERR_PTR(res); + } else { + sqpn = get_sqp_num(to_mdev(pd->device), init_attr); + } + + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, sqpn, &qp); + if (err) + return ERR_PTR(err); + + qp->port = init_attr->port_num; + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : + init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1; + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) { + struct ib_device *device = pd ? pd->device : init_attr->xrcd->device; + struct ib_qp *ibqp; + struct mlx4_ib_dev *dev = to_mdev(device); + + ibqp = _mlx4_ib_create_qp(pd, init_attr, udata); + + if (!IS_ERR(ibqp) && + (init_attr->qp_type == IB_QPT_GSI) && + !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) { + struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp))); + int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num); + + if (is_eth && + dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI; + sqp->roce_v2_gsi = ib_create_qp(pd, init_attr); + + if (IS_ERR(sqp->roce_v2_gsi)) { + pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi)); + sqp->roce_v2_gsi = NULL; + } else { + sqp = to_msqp(to_mqp(sqp->roce_v2_gsi)); + sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP; + } + + init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI; + } + } + return ibqp; +} + +static int _mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_ib_qp *mqp = to_mqp(qp); + + if (is_qp0(dev, mqp)) + mlx4_CLOSE_PORT(dev->dev, mqp->port); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI && + dev->qp1_proxy[mqp->port - 1] == mqp) { + mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); + dev->qp1_proxy[mqp->port - 1] = NULL; + mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); + } + + if (mqp->counter_index) + mlx4_ib_free_qp_counter(dev, mqp); + + if (qp->rwq_ind_tbl) { + destroy_qp_rss(dev, mqp); + } else { + struct mlx4_ib_pd *pd; + + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); + } + + if (is_sqp(dev, mqp)) + kfree(to_msqp(mqp)); + else + kfree(mqp); + + return 0; +} + +int mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_qp *mqp = to_mqp(qp); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + + if (sqp->roce_v2_gsi) + ib_destroy_qp(sqp->roce_v2_gsi); + } + + return _mlx4_ib_destroy_qp(qp); +} + +static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) +{ + switch (type) { + case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; + case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; + case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; + case MLX4_IB_QPT_XRC_INI: + case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_MLX : -1); + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_PROXY_GSI: + case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_UD : -1); + default: return -1; + } +} + +static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX4_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MLX4_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX4_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr, + int attr_mask) +{ + if (attr_mask & IB_QP_PKEY_INDEX) + sqp->pkey_index = attr->pkey_index; + if (attr_mask & IB_QP_QKEY) + sqp->qkey = attr->qkey; + if (attr_mask & IB_QP_SQ_PSN) + sqp->send_psn = attr->sq_psn; +} + +static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) +{ + path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); +} + +static int _mlx4_set_path(struct mlx4_ib_dev *dev, + const struct rdma_ah_attr *ah, + u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) +{ + int vidx; + int smac_index; + int err; + + path->grh_mylmc = rdma_ah_get_path_bits(ah) & 0x7f; + path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah)); + if (rdma_ah_get_static_rate(ah)) { + path->static_rate = rdma_ah_get_static_rate(ah) + + MLX4_STAT_RATE_OFFSET; + while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << path->static_rate & dev->dev->caps.stat_rate_support)) + --path->static_rate; + } else + path->static_rate = 0; + + if (rdma_ah_get_ah_flags(ah) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah); + int real_sgid_index = + mlx4_ib_gid_index_to_real_index(dev, grh->sgid_attr); + + if (real_sgid_index < 0) + return real_sgid_index; + if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) { + pr_err("sgid_index (%u) too large. max is %d\n", + real_sgid_index, dev->dev->caps.gid_table_len[port] - 1); + return -1; + } + + path->grh_mylmc |= 1 << 7; + path->mgid_index = real_sgid_index; + path->hop_limit = grh->hop_limit; + path->tclass_flowlabel = + cpu_to_be32((grh->traffic_class << 20) | + (grh->flow_label)); + memcpy(path->rgid, grh->dgid.raw, 16); + } + + if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (!(rdma_ah_get_ah_flags(ah) & IB_AH_GRH)) + return -1; + + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 7) << 3); + + path->feup |= MLX4_FEUP_FORCE_ETH_UP; + if (vlan_tag < 0x1000) { + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } + path->feup |= MLX4_FVL_FORCE_ETH_VLAN; + path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } + } + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if ((!smac_info->smac && !smac_info->smac_port) || + smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + memcpy(path->dmac, ah->roce.dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 0xf) << 2); + } + + return 0; +} + +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port, + u16 vlan_id, u8 *smac) +{ + return _mlx4_set_path(dev, &qp->ah_attr, + mlx4_mac_to_u64(smac), + vlan_id, + path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, + const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->alt_ah_attr, + 0, + 0xffff, + path, &mqp->alt, port); +} + +static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { + ge->added = 1; + ge->port = qp->port; + } + } +} + +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, + struct mlx4_qp_context *context) +{ + u64 u64_mac; + int smac_index; + + u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]); + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac && !qp->pri.smac_port) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + +static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + struct counter_index *new_counter_index; + int err; + u32 tmp_idx; + + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) != + IB_LINK_LAYER_ETHERNET || + !(qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) || + !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK)) + return 0; + + err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER); + if (err) + return err; + + new_counter_index = kmalloc(sizeof(*new_counter_index), GFP_KERNEL); + if (!new_counter_index) { + mlx4_counter_free(dev->dev, tmp_idx); + return -ENOMEM; + } + + new_counter_index->index = tmp_idx; + new_counter_index->allocated = 1; + qp->counter_index = new_counter_index; + + mutex_lock(&dev->counters_table[qp->port - 1].mutex); + list_add_tail(&new_counter_index->list, + &dev->counters_table[qp->port - 1].counters_list); + mutex_unlock(&dev->counters_table[qp->port - 1].mutex); + + return 0; +} + +enum { + MLX4_QPC_ROCE_MODE_1 = 0, + MLX4_QPC_ROCE_MODE_2 = 2, + MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff +}; + +static u8 gid_type_to_qpc(enum ib_gid_type gid_type) +{ + switch (gid_type) { + case IB_GID_TYPE_ROCE: + return MLX4_QPC_ROCE_MODE_1; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + return MLX4_QPC_ROCE_MODE_2; + default: + return MLX4_QPC_ROCE_MODE_UNDEFINED; + } +} + +/* + * Go over all RSS QP's childes (WQs) and apply their HW state according to + * their logic state if the RSS QP is the first RSS QP associated for the WQ. + */ +static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) +{ + int err = 0; + int i; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + /* Mlx4_ib restrictions: + * WQ's is associated to a port according to the RSS QP it is + * associates to. + * In case the WQ is associated to a different port by another + * RSS QP, return a failure. + */ + if ((wq->rss_usecnt > 0) && (wq->port != port_num)) { + err = -EINVAL; + mutex_unlock(&wq->mutex); + break; + } + wq->port = port_num; + if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) { + err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY); + if (err) { + mutex_unlock(&wq->mutex); + break; + } + } + wq->rss_usecnt++; + + mutex_unlock(&wq->mutex); + } + + if (i && err) { + int j; + + for (j = (i - 1); j >= 0; j--) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[j]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && + (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=0x%06x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + } + + return err; +} + +static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl) +{ + int i; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=%x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } +} + +static void fill_qp_rss_context(struct mlx4_qp_context *context, + struct mlx4_ib_qp *qp) +{ + struct mlx4_rss_context *rss_context; + + rss_context = (void *)context + offsetof(struct mlx4_qp_context, + pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; + + rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz); + rss_context->default_qpn = + cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff); + if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6)) + rss_context->base_qpn_udp = rss_context->default_qpn; + rss_context->flags = qp->rss_ctx->flags; + /* Currently support just toeplitz */ + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + + memcpy(rss_context->rss_key, qp->rss_ctx->rss_key, + MLX4_EN_RSS_KEY_SIZE); +} + +static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct ib_uobject *ibuobject; + struct ib_srq *ibsrq; + const struct ib_gid_attr *gid_attr = NULL; + struct ib_rwq_ind_table *rwq_ind_tbl; + enum ib_qp_type qp_type; + struct mlx4_ib_dev *dev; + struct mlx4_ib_qp *qp; + struct mlx4_ib_pd *pd; + struct mlx4_ib_cq *send_cq, *recv_cq; + struct mlx4_qp_context *context; + enum mlx4_qp_optpar optpar = 0; + int sqd_event; + int steer_qp = 0; + int err = -EINVAL; + int counter_index; + + if (src_type == MLX4_IB_RWQ_SRC) { + struct ib_wq *ibwq; + + ibwq = (struct ib_wq *)src; + ibuobject = ibwq->uobject; + ibsrq = NULL; + rwq_ind_tbl = NULL; + qp_type = IB_QPT_RAW_PACKET; + qp = to_mqp((struct ib_qp *)ibwq); + dev = to_mdev(ibwq->device); + pd = to_mpd(ibwq->pd); + } else { + struct ib_qp *ibqp; + + ibqp = (struct ib_qp *)src; + ibuobject = ibqp->uobject; + ibsrq = ibqp->srq; + rwq_ind_tbl = ibqp->rwq_ind_tbl; + qp_type = ibqp->qp_type; + qp = to_mqp(ibqp); + dev = to_mdev(ibqp->device); + pd = get_pd(qp); + } + + /* APM is not supported under RoCE */ + if (attr_mask & IB_QP_ALT_PATH && + rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) + return -ENOTSUPP; + + context = kzalloc(sizeof *context, GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | + (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + else { + optpar |= MLX4_QP_OPTPAR_PM_STATE; + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11); + break; + } + } + + if (qp->inl_recv_sz) + context->param3 |= cpu_to_be32(1 << 25); + + if (qp->flags & MLX4_IB_QP_SCATTER_FCS) + context->param3 |= cpu_to_be32(1 << 29); + + if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI) + context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; + else if (qp_type == IB_QPT_RAW_PACKET) + context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; + else if (qp_type == IB_QPT_UD) { + if (qp->flags & MLX4_IB_QP_LSO) + context->mtu_msgmax = (IB_MTU_4096 << 5) | + ilog2(dev->dev->caps.max_gso_sz); + else + context->mtu_msgmax = (IB_MTU_4096 << 5) | 13; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { + pr_err("path MTU (%u) is invalid\n", + attr->path_mtu); + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + ilog2(dev->dev->caps.max_msg_sz); + } + + if (!rwq_ind_tbl) { /* PRM RSS receive side should be left zeros */ + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + } + + if (qp->sq.wqe_cnt) + context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; + context->sq_size_stride |= qp->sq.wqe_shift - 4; + + if (new_state == IB_QPS_RESET && qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + context->sq_size_stride |= !!qp->sq_no_prefetch << 7; + context->xrcd = cpu_to_be32((u32) qp->xrcdn); + if (qp_type == IB_QPT_RAW_PACKET) + context->param3 |= cpu_to_be32(1 << 30); + } + + if (ibuobject) + context->usr_page = cpu_to_be32( + mlx4_to_hw_uar_index(dev->dev, + to_mucontext(ibuobject->context) + ->uar.index)); + else + context->usr_page = cpu_to_be32( + mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index)); + + if (attr_mask & IB_QP_DEST_QPN) + context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PORT) { + if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD && + !(attr_mask & IB_QP_AV)) { + mlx4_set_sched(&context->pri_path, attr->port_num); + optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE; + } + } + + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + err = create_qp_lb_counter(dev, qp); + if (err) + goto out; + + counter_index = + dev->counters_table[qp->port - 1].default_counter; + if (qp->counter_index) + counter_index = qp->counter_index->index; + + if (counter_index != -1) { + context->pri_path.counter_index = counter_index; + optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; + if (qp->counter_index) { + context->pri_path.fl |= + MLX4_FL_ETH_SRC_CHECK_MC_LB; + context->pri_path.vlan_control |= + MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; + } + } else + context->pri_path.counter_index = + MLX4_SINK_COUNTER_INDEX(dev->dev); + + if (qp->flags & MLX4_IB_QP_NETIF) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } + + if (qp_type == IB_QPT_GSI) { + enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? + IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; + u8 qpc_roce_mode = gid_type_to_qpc(gid_type); + + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.disable_pkey_check = 0x40; + context->pri_path.pkey_index = attr->pkey_index; + optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; + } + + if (attr_mask & IB_QP_AV) { + u8 port_num = mlx4_is_bonded(dev->dev) ? 1 : + attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + u16 vlan = 0xffff; + u8 smac[ETH_ALEN]; + int is_eth = + rdma_cap_eth_ah(&dev->ib_dev, port_num) && + rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH; + + if (is_eth) { + gid_attr = attr->ah_attr.grh.sgid_attr; + vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev); + memcpy(smac, gid_attr->ndev->dev_addr, ETH_ALEN); + } + + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, + port_num, vlan, smac)) + goto out; + + optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX4_QP_OPTPAR_SCHED_QUEUE); + + if (is_eth && + (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) { + u8 qpc_roce_mode = gid_type_to_qpc(gid_attr->gid_type); + + if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) { + err = -EINVAL; + goto out; + } + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } + + } + + if (attr_mask & IB_QP_TIMEOUT) { + context->pri_path.ackto |= attr->timeout << 3; + optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num == 0 || + attr->alt_port_num > dev->dev->caps.num_ports) + goto out; + + if (attr->alt_pkey_index >= + dev->dev->caps.pkey_table_len[attr->alt_port_num]) + goto out; + + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, + attr->alt_port_num)) + goto out; + + context->alt_path.pkey_index = attr->alt_pkey_index; + context->alt_path.ackto = attr->alt_timeout << 3; + optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; + } + + context->pd = cpu_to_be32(pd->pdn); + + if (!rwq_ind_tbl) { + context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + get_cqs(qp, src_type, &send_cq, &recv_cq); + } else { /* Set dummy CQs to be compatible with HV and PRM */ + send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq); + recv_cq = send_cq; + } + context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); + context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); + + /* Set "fast registration enabled" for all kernel QPs */ + if (!ibuobject) + context->params1 |= cpu_to_be32(1 << 11); + + if (attr_mask & IB_QP_RNR_RETRY) { + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + optpar |= MLX4_QP_OPTPAR_RNR_RETRY; + } + + if (attr_mask & IB_QP_RETRY_CNT) { + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + optpar |= MLX4_QP_OPTPAR_RETRY_COUNT; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_SRA_MAX; + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_RRA_MAX; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask); + optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; + } + + if (ibsrq) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT; + } + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ + if (attr_mask & IB_QP_QKEY) { + if (qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) + context->qkey = cpu_to_be32(IB_QP_SET_QKEY); + else { + if (mlx4_is_mfunc(dev->dev) && + !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && + (attr->qkey & MLX4_RESERVED_QKEY_MASK) == + MLX4_RESERVED_QKEY_BASE) { + pr_err("Cannot use reserved QKEY" + " 0x%x (range 0xffff0000..0xffffffff" + " is reserved)\n", attr->qkey); + err = -EINVAL; + goto out; + } + context->qkey = cpu_to_be32(attr->qkey); + } + optpar |= MLX4_QP_OPTPAR_Q_KEY; + } + + if (ibsrq) + context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibsrq)->msrq.srqn); + + if (qp->rq.wqe_cnt && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_INIT && + new_state == IB_QPS_RTR && + (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI || + qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) { + context->pri_path.sched_queue = (qp->port - 1) << 6; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; + if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) + context->pri_path.fl = 0x80; + } else { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.fl = 0x80; + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, context); + if (err) { + err = -EINVAL; + goto out; + } + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + dev->qp1_proxy[qp->port - 1] = qp; + } + } + } + + if (qp_type == IB_QPT_RAW_PACKET) { + context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | + MLX4_IB_LINK_TYPE_ETH; + if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* set QP to receive both tunneled & non-tunneled packets */ + if (!rwq_ind_tbl) + context->srqn = cpu_to_be32(7 << 28); + } + } + + if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + int is_eth = rdma_port_get_link_layer( + &dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } + } + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) + context->rlkey_roce_mode |= (1 << 4); + + /* + * Before passing a kernel QP to the HW, make sure that the + * ownership bits of the send queue are set and the SQ + * headroom is stamped so that the hardware doesn't start + * processing stale work requests. + */ + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = cpu_to_be32(1 << 31); + ctrl->qpn_vlan.fence_size = + 1 << (qp->sq.wqe_shift - 4); + stamp_send_wqe(qp, i); + } + } + + if (rwq_ind_tbl && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { + fill_qp_rss_context(context, qp); + context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET); + } + + err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), + to_mlx4_state(new_state), context, optpar, + sqd_event, &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) { + qp->port = attr->port_num; + update_mcg_macs(dev, qp); + } + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_sqp_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) + if (mlx4_INIT_PORT(dev->dev, qp->port)) + pr_warn("INIT_PORT failed for port %d\n", + qp->port); + + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx4_CLOSE_PORT(dev->dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET) { + if (!ibuobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibsrq ? to_msrq(ibsrq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } +out: + if (err && qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); + kfree(context); + if (qp->pri.candidate_smac || + (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + + return err; +} + +enum { + MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE | + IB_QP_PORT), +}; + +static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (cur_state != new_state || cur_state != IB_QPS_RESET) { + int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, ll)) { + pr_debug("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d," + " attr_mask 0x%x\n", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); + goto out; + } + + if (ibqp->rwq_ind_tbl) { + if (!(((cur_state == IB_QPS_RESET) && + (new_state == IB_QPS_INIT)) || + ((cur_state == IB_QPS_INIT) && + (new_state == IB_QPS_RTR)))) { + pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n", + ibqp->qp_num, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + + if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) { + pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n", + ibqp->qp_num, attr_mask, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + } + + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { + if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { + if ((ibqp->qp_type == IB_QPT_RC) || + (ibqp->qp_type == IB_QPT_UD) || + (ibqp->qp_type == IB_QPT_UC) || + (ibqp->qp_type == IB_QPT_RAW_PACKET) || + (ibqp->qp_type == IB_QPT_XRC_INI)) { + attr->port_num = mlx4_ib_bond_next_port(dev); + } + } else { + /* no sense in changing port_num + * when ports are bonded */ + attr_mask &= ~IB_QP_PORT; + } + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->num_ports)) { + pr_debug("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) && + (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) != + IB_LINK_LAYER_ETHERNET)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { + pr_debug("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); + goto out; + } + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) { + err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num); + if (err) + goto out; + } + + err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, + cur_state, new_state); + + if (ibqp->rwq_ind_tbl && err) + bring_down_rss_rwqs(ibqp->rwq_ind_tbl); + + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) + attr->port_num = 1; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + int ret; + + ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + int err = 0; + + if (sqp->roce_v2_gsi) + err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask); + if (err) + pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n", + err); + } + return ret; +} + +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.spec_qps[i].qp0_proxy || + qpn == dev->caps.spec_qps[i].qp0_tunnel) { + *qkey = dev->caps.spec_qps[i].qp0_qkey; + return 0; + } + } + return -EINVAL; +} + +static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, + const struct ib_ud_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); + struct ib_device *ib_dev = &mdev->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->ah); + u16 pkey; + u32 qkey; + int send_size; + int header_size; + int spc; + int err; + int i; + + if (wr->wr.opcode != IB_WR_SEND) + return -EINVAL; + + send_size = 0; + + for (i = 0; i < wr->wr.num_sge; ++i) + send_size += wr->wr.sg_list[i].length; + + /* for proxy-qp0 sends, need to add in size of tunnel header */ + /* for tunnel-qp0 sends, tunnel header is already in s/g list */ + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) + send_size += sizeof (struct mlx4_ib_tunnel_header); + + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header); + + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + /* force loopback */ + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + + sqp->ud_header.lrh.virtual_lane = 0; + sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + err = ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); + if (err) + return err; + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); + else + sqp->ud_header.bth.destination_qpn = + cpu_to_be32(mdev->dev->caps.spec_qps[sqp->qp.port - 1].qp0_tunnel); + + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } + sqp->ud_header.deth.qkey = cpu_to_be32(qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); + + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static u8 sl_to_vl(struct mlx4_ib_dev *dev, u8 sl, int port_num) +{ + union sl2vl_tbl_to_u64 tmp_vltab; + u8 vl; + + if (sl > 15) + return 0xf; + tmp_vltab.sl64 = atomic64_read(&dev->sl2vl[port_num - 1]); + vl = tmp_vltab.sl8[sl >> 1]; + if (sl & 1) + vl &= 0x0f; + else + vl >>= 4; + return vl; +} + +static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num, + int index, union ib_gid *gid, + enum ib_gid_type *gid_type) +{ + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct mlx4_port_gid_table *port_gid_table; + unsigned long flags; + + port_gid_table = &iboe->gids[port_num - 1]; + spin_lock_irqsave(&iboe->lock, flags); + memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid)); + *gid_type = port_gid_table->gids[index].gid_type; + spin_unlock_irqrestore(&iboe->lock, flags); + if (rdma_is_zero_gid(gid)) + return -ENOENT; + + return 0; +} + +#define MLX4_ROCEV2_QP1_SPORT 0xC000 +static int build_mlx_header(struct mlx4_ib_sqp *sqp, const struct ib_ud_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = sqp->qp.ibqp.device; + struct mlx4_ib_dev *ibdev = to_mdev(ib_dev); + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->ah); + union ib_gid sgid; + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + int err = 0; + u16 vlan = 0xffff; + bool is_eth; + bool is_vlan = false; + bool is_grh; + bool is_udp = false; + int ip_version = 0; + + send_size = 0; + for (i = 0; i < wr->wr.num_sge; ++i) + send_size += wr->wr.sg_list[i].length; + + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + if (is_eth) { + enum ib_gid_type gid_type; + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; + } else { + err = fill_gid_by_hw_index(ibdev, sqp->qp.port, + ah->av.ib.gid_index, + &sgid, &gid_type); + if (!err) { + is_udp = gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; + if (is_udp) { + if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) + ip_version = 4; + else + ip_version = 6; + is_grh = false; + } + } else { + return err; + } + } + if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { + vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } + } + err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, + ip_version, is_udp, 0, &sqp->ud_header); + if (err) + return err; + + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh || (ip_version == 6)) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) { + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + } else { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache + */ + sqp->ud_header.grh.source_gid.global.subnet_prefix = + cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov. + demux[sqp->qp.port - 1]. + subnet_prefix))); + sqp->ud_header.grh.source_gid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + guid_cache[ah->av.ib.gid_index]; + } else { + sqp->ud_header.grh.source_gid = + ah->ibah.sgid_attr->gid; + } + } + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + if (ip_version == 4) { + sqp->ud_header.ip4.tos = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.ip4.id = 0; + sqp->ud_header.ip4.frag_off = htons(IP_DF); + sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit; + + memcpy(&sqp->ud_header.ip4.saddr, + sgid.raw + 12, 4); + memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4); + sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header); + } + + if (is_udp) { + sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT); + sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT); + sqp->ud_header.udp.csum = 0; + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) + mlx->flags |= cpu_to_be32(0x1); /* force loopback */ + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->wr.opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->wr.ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + struct in6_addr in6; + u16 ether_type; + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; + + ether_type = (!is_udp) ? ETH_P_IBOE: + (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6); + + mlx->sched_prio = cpu_to_be16(pcp); + + ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac); + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + + + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) { + sqp->ud_header.eth.type = cpu_to_be16(ether_type); + } else { + sqp->ud_header.vlan.type = cpu_to_be16(ether_type); + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : + sl_to_vl(to_mdev(ib_dev), + sqp->ud_header.lrh.service_level, + sqp->qp.port); + if (sqp->qp.ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15) + return -EINVAL; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + err = ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, + &pkey); + else + err = ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->pkey_index, + &pkey); + if (err) + return err; + + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ? + sqp->qkey : wr->remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + pr_err("built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + pr_err(" [%02x] ", i * 4); + pr_cont(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + pr_cont("\n"); + } + pr_err("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __be32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | + cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); +} + +static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg, + const struct ib_reg_wr *wr) +{ + struct mlx4_ib_mr *mr = to_mmr(wr->mr); + + fseg->flags = convert_access(wr->access); + fseg->mem_key = cpu_to_be32(wr->key); + fseg->buf_list = cpu_to_be64(mr->page_map); + fseg->start_addr = cpu_to_be64(mr->ibmr.iova); + fseg->reg_len = cpu_to_be64(mr->ibmr.length); + fseg->offset = 0; /* XXX -- is this just for ZBVA? */ + fseg->page_size = cpu_to_be32(ilog2(mr->ibmr.page_size)); + fseg->reserved[0] = 0; + fseg->reserved[1] = 0; +} + +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + memset(iseg, 0, sizeof(*iseg)); + iseg->mem_key = cpu_to_be32(rkey); +} + +static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, + const struct ib_atomic_wr *wr) +{ + if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->swap); + aseg->compare = cpu_to_be64(wr->compare_add); + } else if (wr->wr.opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { + aseg->swap_add = cpu_to_be64(wr->compare_add); + aseg->compare = cpu_to_be64(wr->compare_add_mask); + } else { + aseg->swap_add = cpu_to_be64(wr->compare_add); + aseg->compare = 0; + } + +} + +static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, + const struct ib_atomic_wr *wr) +{ + aseg->swap_add = cpu_to_be64(wr->swap); + aseg->swap_add_mask = cpu_to_be64(wr->swap_mask); + aseg->compare = cpu_to_be64(wr->compare_add); + aseg->compare_mask = cpu_to_be64(wr->compare_add_mask); +} + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + const struct ib_ud_wr *wr) +{ + memcpy(dseg->av, &to_mah(wr->ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(wr->remote_qpn); + dseg->qkey = cpu_to_be32(wr->remote_qkey); + dseg->vlan = to_mah(wr->ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->ah)->av.eth.mac, 6); +} + +static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, + struct mlx4_wqe_datagram_seg *dseg, + const struct ib_ud_wr *wr, + enum mlx4_ib_qp_type qpt) +{ + union mlx4_ext_av *av = &to_mah(wr->ah)->av; + struct mlx4_av sqp_av = {0}; + int port = *((u8 *) &av->ib.port_pd) & 0x3; + + /* force loopback */ + sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); + sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ + sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & + cpu_to_be32(0xf0000000); + + memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp1_tunnel); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp0_tunnel); + /* Use QKEY from the QP context, which is set by master */ + dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); +} + +static void build_tunnel_header(const struct ib_ud_wr *wr, void *wqe, + unsigned *mlx_seg_len) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + struct mlx4_ib_tunnel_header hdr; + struct mlx4_ib_ah *ah = to_mah(wr->ah); + int spc; + int i; + + memcpy(&hdr.av, &ah->av, sizeof hdr.av); + hdr.remote_qpn = cpu_to_be32(wr->remote_qpn); + hdr.pkey_index = cpu_to_be16(wr->pkey_index); + hdr.qkey = cpu_to_be32(wr->remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; + + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (sizeof (hdr) <= spc) { + memcpy(inl + 1, &hdr, sizeof (hdr)); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); + i = 1; + } else { + memcpy(inl + 1, &hdr, spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16); +} + +static void set_mlx_icrc_seg(void *dseg) +{ + u32 *t = dseg; + struct mlx4_wqe_inline_seg *iseg = dseg; + + t[1] = 0; + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + iseg->byte_count = cpu_to_be32((1 << 31) | 4); +} + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = cpu_to_be32(sg->length); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, + const struct ib_ud_wr *wr, struct mlx4_ib_qp *qp, + unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh) +{ + unsigned halign = ALIGN(sizeof *wqe + wr->hlen, 16); + + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); + + if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && + wr->wr.num_sge > qp->sq.max_gs - (halign >> 4))) + return -EINVAL; + + memcpy(wqe->header, wr->header, wr->hlen); + + *lso_hdr_sz = cpu_to_be32(wr->mss << 16 | wr->hlen); + *lso_seg_len = halign; + return 0; +} + +static __be32 send_ieth(const struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static void add_zero_len_inline(void *wqe) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + memset(wqe, 0, 16); + inl->byte_count = cpu_to_be32(1 << 31); +} + +static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_data_seg *dseg; + unsigned long flags; + int nreq; + int err = 0; + unsigned ind; + int uninitialized_var(size); + unsigned uninitialized_var(seglen); + __be32 dummy; + __be32 *lso_wqe; + __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(qp); + + if (sqp->roce_v2_gsi) { + struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah); + enum ib_gid_type gid_type; + union ib_gid gid; + + if (!fill_gid_by_hw_index(mdev, sqp->qp.port, + ah->av.ib.gid_index, + &gid, &gid_type)) + qp = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? + to_mqp(sqp->roce_v2_gsi) : qp; + else + pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n", + ah->av.ib.gid_index); + } + } + + spin_lock_irqsave(&qp->sq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR && + !drain) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->sq_next_wqe; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + lso_wqe = &dummy; + blh = 0; + + if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | + ((wr->send_flags & IB_SEND_IP_CSUM) ? + cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) | + qp->sq_signal_bits; + + ctrl->imm = send_ieth(wr); + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (qp->mlx4_ib_qp_type) { + case MLX4_IB_QPT_RC: + case MLX4_IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, + atomic_wr(wr)->rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, atomic_wr(wr)); + wqe += sizeof (struct mlx4_wqe_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, + atomic_wr(wr)->rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_masked_atomic_seg(wqe, atomic_wr(wr)); + wqe += sizeof (struct mlx4_wqe_masked_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16; + + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + break; + + case IB_WR_LOCAL_INV: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_local_inv_seg(wqe, wr->ex.invalidate_rkey); + wqe += sizeof (struct mlx4_wqe_local_inval_seg); + size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; + break; + + case IB_WR_REG_MR: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_reg_seg(wqe, reg_wr(wr)); + wqe += sizeof(struct mlx4_wqe_fmr_seg); + size += sizeof(struct mlx4_wqe_fmr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + break; + + case MLX4_IB_QPT_TUN_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr), + ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_TUN_GSI: + /* this is a UD qp used in MAD responses to slaves. */ + set_datagram_seg(wqe, ud_wr(wr)); + /* set the forced-loopback bit in the data seg av */ + *(__be32 *) wqe |= cpu_to_be32(0x80000000); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + case MLX4_IB_QPT_UD: + set_datagram_seg(wqe, ud_wr(wr)); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + err = build_lso_seg(wqe, ud_wr(wr), qp, &seglen, + &lso_hdr_sz, &blh); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + lso_wqe = (__be32 *) wqe; + wqe += seglen; + size += seglen / 16; + } + break; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr), + ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + /* to start tunnel header on a cache-line boundary */ + add_zero_len_inline(wqe); + wqe += 16; + size++; + build_tunnel_header(ud_wr(wr), wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + /* If we are tunneling special qps, this is a UD qp. + * In this case we first add a UD segment targeting + * the tunnel qp, and then add a header with address + * information */ + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, + ud_wr(wr), + qp->mlx4_ib_qp_type); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + build_tunnel_header(ud_wr(wr), wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + err = build_mlx_header(to_msqp(qp), ud_wr(wr), ctrl, + &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + + default: + break; + } + + /* + * Write data segments in reverse order, so as to + * overwrite cacheline stamp last within each + * cacheline. This avoids issues with WQE + * prefetching. + */ + + dseg = wqe; + dseg += wr->num_sge - 1; + size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); + + /* Add one more inline data segment for ICRC for MLX sends */ + if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { + set_mlx_icrc_seg(dseg + 1); + size += sizeof (struct mlx4_wqe_data_seg) / 16; + } + + for (i = wr->num_sge - 1; i >= 0; --i, --dseg) + set_data_seg(dseg, wr->sg_list + i); + + /* + * Possibly overwrite stamping in cacheline with LSO + * segment only after making sure all data segments + * are written. + */ + wmb(); + *lso_wqe = lso_hdr_sz; + + ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + *bad_wr = wr; + err = -EINVAL; + goto out; + } + + ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (wr->next) + stamp_send_wqe(qp, ind + qp->sq_spare_wqes); + ind++; + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + writel_relaxed(qp->doorbell_qpn, + to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); + + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + + stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1); + + qp->sq_next_wqe = ind; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return _mlx4_ib_post_send(ibqp, wr, bad_wr, false); +} + +static int _mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int max_gs; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + + max_gs = qp->rq.max_gs; + spin_lock_irqsave(&qp->rq.lock, flags); + + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR && + !drain) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + ib_dma_sync_single_for_device(ibqp->device, + qp->sqp_proxy_rcv[ind].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + scat->byte_count = + cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); + /* use dma lkey from upper layer entry */ + scat->lkey = cpu_to_be32(wr->sg_list->lkey); + scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); + scat++; + max_gs--; + } + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return _mlx4_ib_post_recv(ibqp, wr, bad_wr, false); +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) +{ + switch (mlx4_state) { + case MLX4_QP_STATE_RST: return IB_QPS_RESET; + case MLX4_QP_STATE_INIT: return IB_QPS_INIT; + case MLX4_QP_STATE_RTR: return IB_QPS_RTR; + case MLX4_QP_STATE_RTS: return IB_QPS_RTS; + case MLX4_QP_STATE_SQ_DRAINING: + case MLX4_QP_STATE_SQD: return IB_QPS_SQD; + case MLX4_QP_STATE_SQER: return IB_QPS_SQE; + case MLX4_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) +{ + switch (mlx4_mig_state) { + case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX4_QP_PM_REARM: return IB_MIG_REARM; + case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx4_flags) +{ + int ib_flags = 0; + + if (mlx4_flags & MLX4_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx4_flags & MLX4_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx4_flags & MLX4_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_rdma_ah_attr(struct mlx4_ib_dev *ibdev, + struct rdma_ah_attr *ah_attr, + struct mlx4_qp_path *path) +{ + struct mlx4_dev *dev = ibdev->dev; + u8 port_num = path->sched_queue & 0x40 ? 2 : 1; + + memset(ah_attr, 0, sizeof(*ah_attr)); + if (port_num == 0 || port_num > dev->caps.num_ports) + return; + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port_num); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) + rdma_ah_set_sl(ah_attr, ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1)); + else + rdma_ah_set_sl(ah_attr, (path->sched_queue >> 2) & 0xf); + rdma_ah_set_port_num(ah_attr, port_num); + + rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid)); + rdma_ah_set_path_bits(ah_attr, path->grh_mylmc & 0x7f); + rdma_ah_set_static_rate(ah_attr, + path->static_rate ? path->static_rate - 5 : 0); + if (path->grh_mylmc & (1 << 7)) { + rdma_ah_set_grh(ah_attr, NULL, + be32_to_cpu(path->tclass_flowlabel) & 0xfffff, + path->mgid_index, + path->hop_limit, + (be32_to_cpu(path->tclass_flowlabel) + >> 20) & 0xff); + rdma_ah_set_dgid_raw(ah_attr, path->rgid); + } +} + +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context context; + int mlx4_state; + int err = 0; + + if (ibqp->rwq_ind_tbl) + return -EOPNOTSUPP; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, &qp->mqp, &context); + if (err) { + err = -EINVAL; + goto out; + } + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp->state = to_ib_qp_state(mlx4_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = + rdma_ah_get_port_num(&qp_attr->alt_ah_attr); + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* + * We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX4_IB_QP_LSO) + qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + + qp_init_attr->sq_sig_type = + qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev; + struct ib_qp_init_attr ib_qp_init_attr; + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_wq ucmd; + int err, required_cmd_sz; + + if (!(udata && pd->uobject)) + return ERR_PTR(-EINVAL); + + required_cmd_sz = offsetof(typeof(ucmd), comp_mask) + + sizeof(ucmd.comp_mask); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + dev = to_mdev(pd->device); + + if (init_attr->wq_type != IB_WQT_RQ) { + pr_debug("unsupported wq type %d\n", init_attr->wq_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS) { + pr_debug("unsupported create_flags %u\n", + init_attr->create_flags); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + memset(&ib_qp_init_attr, 0, sizeof(ib_qp_init_attr)); + ib_qp_init_attr.qp_context = init_attr->wq_context; + ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET; + ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr; + ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge; + ib_qp_init_attr.recv_cq = init_attr->cq; + ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */ + + if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) + ib_qp_init_attr.create_flags |= IB_QP_CREATE_SCATTER_FCS; + + err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr, + udata, 0, &qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibwq.event_handler = init_attr->event_handler; + qp->ibwq.wq_num = qp->mqp.qpn; + qp->ibwq.state = IB_WQS_RESET; + + return &qp->ibwq; +} + +static int ib_wq2qp_state(enum ib_wq_state state) +{ + switch (state) { + case IB_WQS_RESET: + return IB_QPS_RESET; + case IB_WQS_RDY: + return IB_QPS_RTR; + default: + return IB_QPS_ERR; + } +} + +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + enum ib_qp_state qp_cur_state; + enum ib_qp_state qp_new_state; + int attr_mask; + int err; + + /* ib_qp.state represents the WQ HW state while ib_wq.state represents + * the WQ logic state. + */ + qp_cur_state = qp->state; + qp_new_state = ib_wq2qp_state(new_state); + + if (ib_wq2qp_state(new_state) == qp_cur_state) + return 0; + + if (new_state == IB_WQS_RDY) { + struct ib_qp_attr attr = {}; + + attr.port_num = qp->port; + attr_mask = IB_QP_PORT; + + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr, + attr_mask, IB_QPS_RESET, IB_QPS_INIT); + if (err) { + pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n", + ibwq->wq_num); + return err; + } + + qp_cur_state = IB_QPS_INIT; + } + + attr_mask = 0; + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask, + qp_cur_state, qp_new_state); + + if (err && (qp_cur_state == IB_QPS_INIT)) { + qp_new_state = IB_QPS_RESET; + if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, + attr_mask, IB_QPS_INIT, IB_QPS_RESET)) { + pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n", + ibwq->wq_num); + qp_new_state = IB_QPS_INIT; + } + } + + qp->state = qp_new_state; + + return err; +} + +int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + struct mlx4_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + enum ib_wq_state cur_state, new_state; + int err = 0; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + if (wq_attr_mask & IB_WQ_FLAGS) + return -EOPNOTSUPP; + + cur_state = wq_attr_mask & IB_WQ_CUR_STATE ? wq_attr->curr_wq_state : + ibwq->state; + new_state = wq_attr_mask & IB_WQ_STATE ? wq_attr->wq_state : cur_state; + + if (cur_state < IB_WQS_RESET || cur_state > IB_WQS_ERR || + new_state < IB_WQS_RESET || new_state > IB_WQS_ERR) + return -EINVAL; + + if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR)) + return -EINVAL; + + if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET)) + return -EINVAL; + + /* Need to protect against the parent RSS which also may modify WQ + * state. + */ + mutex_lock(&qp->mutex); + + /* Can update HW state only if a RSS QP has already associated to this + * WQ, so we can apply its port on the WQ. + */ + if (qp->rss_usecnt) + err = _mlx4_ib_modify_wq(ibwq, new_state); + + if (!err) + ibwq->state = new_state; + + mutex_unlock(&qp->mutex); + + return err; +} + +int mlx4_ib_destroy_wq(struct ib_wq *ibwq) +{ + struct mlx4_ib_dev *dev = to_mdev(ibwq->device); + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + + if (qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + + destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1); + + kfree(qp); + + return 0; +} + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_rwq_ind_table *rwq_ind_table; + struct mlx4_ib_create_rwq_ind_tbl_resp resp = {}; + unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size; + unsigned int base_wqn; + size_t min_resp_len; + int i; + int err; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + if (ind_tbl_size > + device->attrs.rss_caps.max_rwq_indirection_table_size) { + pr_debug("log_ind_tbl_size = %d is bigger than supported = %d\n", + ind_tbl_size, + device->attrs.rss_caps.max_rwq_indirection_table_size); + return ERR_PTR(-EINVAL); + } + + base_wqn = init_attr->ind_tbl[0]->wq_num; + + if (base_wqn % ind_tbl_size) { + pr_debug("WQN=0x%x isn't aligned with indirection table size\n", + base_wqn); + return ERR_PTR(-EINVAL); + } + + for (i = 1; i < ind_tbl_size; i++) { + if (++base_wqn != init_attr->ind_tbl[i]->wq_num) { + pr_debug("indirection table's WQNs aren't consecutive\n"); + return ERR_PTR(-EINVAL); + } + } + + rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL); + if (!rwq_ind_table) + return ERR_PTR(-ENOMEM); + + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err; + } + + return rwq_ind_table; + +err: + kfree(rwq_ind_table); + return ERR_PTR(err); +} + +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + kfree(ib_rwq_ind_tbl); + return 0; +} + +struct mlx4_ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void mlx4_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx4_ib_drain_cqe *cqe = container_of(wc->wr_cqe, + struct mlx4_ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* This function returns only once the drained WR was completed */ +static void handle_drain_completion(struct ib_cq *cq, + struct mlx4_ib_drain_cqe *sdrain, + struct mlx4_ib_dev *dev) +{ + struct mlx4_dev *mdev = dev->dev; + + if (cq->poll_ctx == IB_POLL_DIRECT) { + while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + return; + } + + if (mdev->persist->state == MLX4_DEVICE_STATE_INTERNAL_ERROR) { + struct mlx4_ib_cq *mcq = to_mcq(cq); + bool triggered = false; + unsigned long flags; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + /* Make sure that the CQ handler won't run if wasn't run yet */ + if (!mcq->mcq.reset_notify_added) + mcq->mcq.reset_notify_added = 1; + else + triggered = true; + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (triggered) { + /* Wait for any scheduled/running task to be ended */ + switch (cq->poll_ctx) { + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + irq_poll_enable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + cancel_work_sync(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + } + + /* Run the CQ handler - this makes sure that the drain WR will + * be processed if wasn't processed yet. + */ + mcq->mcq.comp(&mcq->mcq); + } + + wait_for_completion(&sdrain->done); +} + +void mlx4_ib_drain_sq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->send_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx4_ib_drain_cqe sdrain; + const struct ib_send_wr *bad_swr; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_dev *mdev = dev->dev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + sdrain.cqe.done = mlx4_ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = _mlx4_ib_post_send(qp, &swr.wr, &bad_swr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &sdrain, dev); +} + +void mlx4_ib_drain_rq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->recv_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx4_ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}; + const struct ib_recv_wr *bad_rwr; + int ret; + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_dev *mdev = dev->dev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = mlx4_ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = _mlx4_ib_post_recv(qp, &rwr, &bad_rwr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &rdrain, dev); +} diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c new file mode 100644 index 000000000..3731b31c3 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mlx4/qp.h> +#include <linux/mlx4/srq.h> +#include <linux/slab.h> + +#include "mlx4_ib.h" +#include <rdma/mlx4-abi.h> + +static void *get_wqe(struct mlx4_ib_srq *srq, int n) +{ + return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX4_EVENT_TYPE_SRQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on SRQ %06x\n", type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_srq *srq; + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scatter; + u32 cqn; + u16 xrcdn; + int desc_size; + int buf_size; + int err; + int i; + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || + init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) + return ERR_PTR(-EINVAL); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = max(32UL, + roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) + + srq->msrq.max_gs * + sizeof (struct mlx4_wqe_data_seg))); + srq->msrq.wqe_shift = ilog2(desc_size); + + buf_size = srq->msrq.max * desc_size; + + if (pd->uobject) { + struct mlx4_ib_create_srq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_srq; + } + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + buf_size, 0, 0); + if (IS_ERR(srq->umem)) { + err = PTR_ERR(srq->umem); + goto err_srq; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem), + srq->umem->page_shift, &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem); + if (err) + goto err_mtt; + + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) + goto err_mtt; + } else { + err = mlx4_db_alloc(dev->dev, &srq->db, 0); + if (err) + goto err_srq; + + *srq->db.db = 0; + + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, + &srq->buf)) { + err = -ENOMEM; + goto err_db; + } + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + desc_size; + ++scatter) + scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY); + } + + err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift, + &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + if (err) + goto err_mtt; + + srq->wrid = kvmalloc_array(srq->msrq.max, + sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } + } + + cqn = ib_srq_has_cq(init_attr->srq_type) ? + to_mcq(init_attr->ext.cq)->mcq.cqn : 0; + xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : + (u16) dev->dev->caps.reserved_xrcds; + err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, + srq->db.dma, &srq->msrq); + if (err) + goto err_wrid; + + srq->msrq.event = mlx4_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { + err = -EFAULT; + goto err_wrid; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_wrid: + if (pd->uobject) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + else + kvfree(srq->wrid); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &srq->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(srq->umem); + else + mlx4_buf_free(dev->dev, buf_size, &srq->buf); + +err_db: + if (!pd->uobject) + mlx4_db_free(dev->dev, &srq->db); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + int limit_watermark; + + ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark); + if (ret) + return ret; + + srq_attr->srq_limit = limit_watermark; + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + + return 0; +} + +int mlx4_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx4_ib_dev *dev = to_mdev(srq->device); + struct mlx4_ib_srq *msrq = to_msrq(srq); + + mlx4_srq_free(dev->dev, &msrq->msrq); + mlx4_mtt_cleanup(dev->dev, &msrq->mtt); + + if (srq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + kvfree(msrq->wrid); + mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, + &msrq->buf); + mlx4_db_free(dev->dev, &msrq->db); + } + + kfree(msrq); + + return 0; +} + +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) +{ + struct mlx4_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device); + + spin_lock_irqsave(&srq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx4_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c new file mode 100644 index 000000000..d2da28d61 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -0,0 +1,886 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*#include "core_priv.h"*/ +#include "mlx4_ib.h" +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/stat.h> + +#include <rdma/ib_mad.h> +/*show_admin_alias_guid returns the administratively assigned value of that GUID. + * Values returned in buf parameter string: + * 0 - requests opensm to assign a value. + * ffffffffffffffff - delete this entry. + * other - value assigned by administrator. + */ +static ssize_t show_admin_alias_guid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + __be64 sysadmin_ag_val; + + sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev, + mlx4_ib_iov_dentry->entry_num, + port->num); + + return sprintf(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val)); +} + +/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. + * Values in buf parameter string: + * 0 - requests opensm to assign a value. + * 0xffffffffffffffff - delete this entry. + * other - guid value assigned by the administrator. + */ +static ssize_t store_admin_alias_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u64 sysadmin_ag_val; + unsigned long flags; + + record_num = mlx4_ib_iov_dentry->entry_num / 8; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; + if (0 == record_num && 0 == guid_index_in_rec) { + pr_err("GUID 0 block 0 is RO\n"); + return count; + } + spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags); + sscanf(buf, "%llx", &sysadmin_ag_val); + *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * guid_index_in_rec] = + cpu_to_be64(sysadmin_ag_val); + + /* Change the state to be pending for update */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE ; + mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val), + mlx4_ib_iov_dentry->entry_num, + port->num); + + /* set the record index */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + + spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags); + mlx4_ib_init_alias_guid_work(mdev, port->num - 1); + + return count; +} + +static ssize_t show_port_gid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + union ib_gid gid; + ssize_t ret; + + ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &gid, 1); + if (ret) + return ret; + ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + return ret; +} + +static ssize_t show_phys_port_pkey(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u16 pkey; + ssize_t ret; + + ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &pkey, 1); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define DENTRY_REMOVE(_dentry) \ +do { \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ +} while (0); + +static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry, + char *_name, struct kobject *_kobj, + ssize_t (*show)(struct device *dev, + struct device_attribute *attr, + char *buf), + ssize_t (*store)(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + ) +{ + int ret = 0; + struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry; + + vdentry->ctx = _ctx; + vdentry->dentry.show = show; + vdentry->dentry.store = store; + sysfs_attr_init(&vdentry->dentry.attr); + vdentry->dentry.attr.name = vdentry->name; + vdentry->dentry.attr.mode = 0; + vdentry->kobj = _kobj; + snprintf(vdentry->name, 15, "%s", _name); + + if (vdentry->dentry.store) + vdentry->dentry.attr.mode |= S_IWUSR; + + if (vdentry->dentry.show) + vdentry->dentry.attr.mode |= S_IRUGO; + + ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr); + if (ret) { + pr_err("failed to create %s\n", vdentry->dentry.attr.name); + vdentry->ctx = NULL; + return ret; + } + + return ret; +} + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + int ret; + + ret = sysfs_create_file(port->mcgs_parent, attr); + if (ret) + pr_err("failed to create %s\n", attr->name); + + return ret; +} + +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + + sysfs_remove_file(port->mcgs_parent, attr); +} + +static int add_port_entries(struct mlx4_ib_dev *device, int port_num) +{ + int i; + char buff[11]; + struct mlx4_ib_iov_port *port = NULL; + int ret = 0 ; + struct ib_port_attr attr; + + memset(&attr, 0, sizeof(attr)); + /* get the physical gid and pkey table sizes.*/ + ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1); + if (ret) + goto err; + + port = &device->iov_ports[port_num - 1]; + port->dev = device; + port->num = port_num; + /* Directory structure: + * iov - + * port num - + * admin_guids + * gids (operational) + * mcg_table + */ + port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar), + GFP_KERNEL); + if (!port->dentr_ar) { + ret = -ENOMEM; + goto err; + } + sprintf(buff, "%d", port_num); + port->cur_port = kobject_create_and_add(buff, + kobject_get(device->ports_parent)); + if (!port->cur_port) { + ret = -ENOMEM; + goto kobj_create_err; + } + /* admin GUIDs */ + port->admin_alias_parent = kobject_create_and_add("admin_guids", + kobject_get(port->cur_port)); + if (!port->admin_alias_parent) { + ret = -ENOMEM; + goto err_admin_guids; + } + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[i].entry_num = i; + ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i], + buff, port->admin_alias_parent, + show_admin_alias_guid, store_admin_alias_guid); + if (ret) + goto err_admin_alias_parent; + } + + /* gids subdirectory (operational gids) */ + port->gids_parent = kobject_create_and_add("gids", + kobject_get(port->cur_port)); + if (!port->gids_parent) { + ret = -ENOMEM; + goto err_gids; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[attr.gid_tbl_len + i], + buff, + port->gids_parent, show_port_gid, NULL); + if (ret) + goto err_gids_parent; + } + + /* physical port pkey table */ + port->pkeys_parent = + kobject_create_and_add("pkeys", kobject_get(port->cur_port)); + if (!port->pkeys_parent) { + ret = -ENOMEM; + goto err_pkeys; + } + + for (i = 0 ; i < attr.pkey_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i], + buff, port->pkeys_parent, + show_phys_port_pkey, NULL); + if (ret) + goto err_pkeys_parent; + } + + /* MCGs table */ + port->mcgs_parent = + kobject_create_and_add("mcgs", kobject_get(port->cur_port)); + if (!port->mcgs_parent) { + ret = -ENOMEM; + goto err_mcgs; + } + return 0; + +err_mcgs: + kobject_put(port->cur_port); + +err_pkeys_parent: + kobject_put(port->pkeys_parent); + +err_pkeys: + kobject_put(port->cur_port); + +err_gids_parent: + kobject_put(port->gids_parent); + +err_gids: + kobject_put(port->cur_port); + +err_admin_alias_parent: + kobject_put(port->admin_alias_parent); + +err_admin_guids: + kobject_put(port->cur_port); + kobject_put(port->cur_port); /* once more for create_and_add buff */ + +kobj_create_err: + kobject_put(device->ports_parent); + kfree(port->dentr_ar); + +err: + pr_err("add_port_entries FAILED: for port:%d, error: %d\n", + port_num, ret); + return ret; +} + +static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) +{ + /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n + * with no ARI only 3 last bits are used so when the fn is higher than 8 + * need to add it to the dev num, so count in the last number will be + * modulo 8 */ + snprintf(name, max, "%.8s%.2d.%d", pci_name(dev->dev->persist->pdev), + i / 8, i % 8); +} + +struct mlx4_port { + struct kobject kobj; + struct mlx4_ib_dev *dev; + struct attribute_group pkey_group; + struct attribute_group gid_group; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; + int slave; + u8 port_num; +}; + + +static void mlx4_port_release(struct kobject *kobj) +{ + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + struct attribute *a; + int i; + + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + kfree(p->pkey_group.attrs); + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + kfree(p->gid_group.attrs); + kfree(p); +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->show) + return -EIO; + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p, port_attr, buf, size); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store, +}; + +static struct kobj_type port_type = { + .release = mlx4_port_release, + .sysfs_ops = &port_sysfs_ops, +}; + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + ssize_t ret = -ENODEV; + + if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >= + (p->dev->dev->caps.pkey_table_len[p->port_num])) + ret = sprintf(buf, "none\n"); + else + ret = sprintf(buf, "%d\n", + p->dev->pkeys.virt2phys_pkey[p->slave] + [p->port_num - 1][tab_attr->index]); + return ret; +} + +static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + const char *buf, size_t count) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int idx; + int err; + + /* do not allow remapping Dom0 virtual pkey table */ + if (p->slave == mlx4_master_func_num(p->dev->dev)) + return -EINVAL; + + if (!strncasecmp(buf, "no", 2)) + idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1; + else if (sscanf(buf, "%i", &idx) != 1 || + idx >= p->dev->dev->caps.pkey_table_len[p->port_num] || + idx < 0) + return -EINVAL; + + p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1] + [tab_attr->index] = idx; + mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num, + tab_attr->index, idx); + err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num); + if (err) { + pr_err("mlx4_gen_pkey_eqe failed for slave %d," + " port %d, index %d\n", p->slave, p->port_num, idx); + return err; + } + return count; +} + +static ssize_t show_port_gid_idx(struct mlx4_port *p, + struct port_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", p->slave); +} + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct mlx4_port *, + struct port_attribute *, char *buf), + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof (struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + if (snprintf(element->name, sizeof (element->name), + "%d", i) >= sizeof (element->name)) { + kfree(element); + goto err; + } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + if (store) { + element->attr.attr.mode = S_IWUSR | S_IRUGO; + element->attr.store = store; + } else + element->attr.attr.mode = S_IRUGO; + + element->attr.show = show; + element->index = i; + tab_attr[i] = &element->attr.attr; + } + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + +static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) +{ + struct mlx4_port *p; + int i; + int ret; + int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) == + IB_LINK_LAYER_ETHERNET; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->dev = dev; + p->port_num = port_num; + p->slave = slave; + + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(dev->dev_ports_parent[slave]), + "%d", port_num); + if (ret) + goto err_alloc; + + p->pkey_group.name = "pkey_idx"; + p->pkey_group.attrs = + alloc_group_attrs(show_port_pkey, + is_eth ? NULL : store_port_pkey, + dev->dev->caps.pkey_table_len[port_num]); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_alloc; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + p->gid_group.name = "gid_idx"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_free_pkey; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); + return 0; + +err_free_gid: + kfree(p->gid_group.attrs[0]); + kfree(p->gid_group.attrs); + +err_free_pkey: + for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i) + kfree(p->pkey_group.attrs[i]); + kfree(p->pkey_group.attrs); + +err_alloc: + kobject_put(dev->dev_ports_parent[slave]); + kfree(p); + return ret; +} + +static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) +{ + char name[32]; + int err; + int port; + struct kobject *p, *t; + struct mlx4_port *mport; + struct mlx4_active_ports actv_ports; + + get_name(dev, name, slave, sizeof name); + + dev->pkeys.device_parent[slave] = + kobject_create_and_add(name, kobject_get(dev->iov_parent)); + + if (!dev->pkeys.device_parent[slave]) { + err = -ENOMEM; + goto fail_dev; + } + + INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]); + + dev->dev_ports_parent[slave] = + kobject_create_and_add("ports", + kobject_get(dev->pkeys.device_parent[slave])); + + if (!dev->dev_ports_parent[slave]) { + err = -ENOMEM; + goto err_ports; + } + + actv_ports = mlx4_get_active_ports(dev->dev, slave); + + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + err = add_port(dev, port, slave); + if (err) + goto err_add; + } + return 0; + +err_add: + list_for_each_entry_safe(p, t, + &dev->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + mport = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &mport->pkey_group); + sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); + kobject_put(p); + } + kobject_put(dev->dev_ports_parent[slave]); + +err_ports: + kobject_put(dev->pkeys.device_parent[slave]); + /* extra put for the device_parent create_and_add */ + kobject_put(dev->pkeys.device_parent[slave]); + +fail_dev: + kobject_put(dev->iov_parent); + return err; +} + +static int register_pkey_tree(struct mlx4_ib_dev *device) +{ + int i; + + if (!mlx4_is_master(device->dev)) + return 0; + + for (i = 0; i <= device->dev->persist->num_vfs; ++i) + register_one_pkey_tree(device, i); + + return 0; +} + +static void unregister_pkey_tree(struct mlx4_ib_dev *device) +{ + int slave; + struct kobject *p, *t; + struct mlx4_port *port; + + if (!mlx4_is_master(device->dev)) + return; + + for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) { + list_for_each_entry_safe(p, t, + &device->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); + kobject_put(p); + kobject_put(device->dev_ports_parent[slave]); + } + kobject_put(device->dev_ports_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->iov_parent); + } +} + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) +{ + int i; + int ret = 0; + + if (!mlx4_is_master(dev->dev)) + return 0; + + dev->iov_parent = + kobject_create_and_add("iov", + kobject_get(dev->ib_dev.ports_parent->parent)); + if (!dev->iov_parent) { + ret = -ENOMEM; + goto err; + } + dev->ports_parent = + kobject_create_and_add("ports", + kobject_get(dev->iov_parent)); + if (!dev->ports_parent) { + ret = -ENOMEM; + goto err_ports; + } + + for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) { + ret = add_port_entries(dev, i); + if (ret) + goto err_add_entries; + } + + ret = register_pkey_tree(dev); + if (ret) + goto err_add_entries; + return 0; + +err_add_entries: + kobject_put(dev->ports_parent); + +err_ports: + kobject_put(dev->iov_parent); +err: + kobject_put(dev->ib_dev.ports_parent->parent); + pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); + return ret; +} + +static void unregister_alias_guid_tree(struct mlx4_ib_dev *device) +{ + struct mlx4_ib_iov_port *p; + int i; + + if (!mlx4_is_master(device->dev)) + return; + + for (i = 0; i < device->dev->caps.num_ports; i++) { + p = &device->iov_ports[i]; + kobject_put(p->admin_alias_parent); + kobject_put(p->gids_parent); + kobject_put(p->pkeys_parent); + kobject_put(p->mcgs_parent); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->dev->ports_parent); + kfree(p->dentr_ar); + } +} + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) +{ + unregister_alias_guid_tree(device); + unregister_pkey_tree(device); + kobject_put(device->ports_parent); + kobject_put(device->iov_parent); + kobject_put(device->iov_parent); + kobject_put(device->ib_dev.ports_parent->parent); +} |