summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/mlx4
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/mlx4')
-rw-r--r--drivers/infiniband/hw/mlx4/Kconfig11
-rw-r--r--drivers/infiniband/hw/mlx4/Makefile4
-rw-r--r--drivers/infiniband/hw/mlx4/ah.c234
-rw-r--r--drivers/infiniband/hw/mlx4/alias_GUID.c902
-rw-r--r--drivers/infiniband/hw/mlx4/cm.c604
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c973
-rw-r--r--drivers/infiniband/hw/mlx4/doorbell.c100
-rw-r--r--drivers/infiniband/hw/mlx4/mad.c2399
-rw-r--r--drivers/infiniband/hw/mlx4/main.c3344
-rw-r--r--drivers/infiniband/hw/mlx4/mcg.c1267
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h943
-rw-r--r--drivers/infiniband/hw/mlx4/mr.c717
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c4474
-rw-r--r--drivers/infiniband/hw/mlx4/srq.c374
-rw-r--r--drivers/infiniband/hw/mlx4/sysfs.c872
15 files changed, 17218 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644
index 000000000..f30ce9dd0
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config MLX4_INFINIBAND
+ tristate "Mellanox ConnectX HCA support"
+ depends on NETDEVICES && ETHERNET && PCI && INET
+ select NET_VENDOR_MELLANOX
+ select MLX4_CORE
+ help
+ This driver provides low-level InfiniBand support for
+ Mellanox ConnectX PCI Express host channel adapters (HCAs).
+ This is required to use InfiniBand protocols such as
+ IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644
index 000000000..7b6757b02
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o
+
+mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644
index 000000000..7321d6ab5
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include <linux/slab.h>
+#include <linux/inet.h>
+#include <linux/string.h>
+#include <linux/mlx4/driver.h>
+
+#include "mlx4_ib.h"
+
+static void create_ib_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
+{
+ struct mlx4_ib_ah *ah = to_mah(ib_ah);
+ struct mlx4_dev *dev = to_mdev(ib_ah->device)->dev;
+
+ ah->av.ib.port_pd = cpu_to_be32(to_mpd(ib_ah->pd)->pdn |
+ (rdma_ah_get_port_num(ah_attr) << 24));
+ ah->av.ib.g_slid = rdma_ah_get_path_bits(ah_attr);
+ ah->av.ib.sl_tclass_flowlabel =
+ cpu_to_be32(rdma_ah_get_sl(ah_attr) << 28);
+ if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) {
+ const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
+
+ ah->av.ib.g_slid |= 0x80;
+ ah->av.ib.gid_index = grh->sgid_index;
+ ah->av.ib.hop_limit = grh->hop_limit;
+ ah->av.ib.sl_tclass_flowlabel |=
+ cpu_to_be32((grh->traffic_class << 20) |
+ grh->flow_label);
+ memcpy(ah->av.ib.dgid, grh->dgid.raw, 16);
+ }
+
+ ah->av.ib.dlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr));
+ if (rdma_ah_get_static_rate(ah_attr)) {
+ u8 static_rate = rdma_ah_get_static_rate(ah_attr) +
+ MLX4_STAT_RATE_OFFSET;
+
+ while (static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << static_rate & dev->caps.stat_rate_support))
+ --static_rate;
+ ah->av.ib.stat_rate = static_rate;
+ }
+}
+
+static int create_iboe_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
+{
+ struct mlx4_ib_dev *ibdev = to_mdev(ib_ah->device);
+ struct mlx4_ib_ah *ah = to_mah(ib_ah);
+ const struct ib_gid_attr *gid_attr;
+ struct mlx4_dev *dev = ibdev->dev;
+ int is_mcast = 0;
+ struct in6_addr in6;
+ u16 vlan_tag = 0xffff;
+ const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
+ int ret;
+
+ memcpy(&in6, grh->dgid.raw, sizeof(in6));
+ if (rdma_is_multicast_addr(&in6))
+ is_mcast = 1;
+
+ memcpy(ah->av.eth.mac, ah_attr->roce.dmac, ETH_ALEN);
+ eth_zero_addr(ah->av.eth.s_mac);
+
+ /*
+ * If sgid_attr is NULL we are being called by mlx4_ib_create_ah_slave
+ * and we are directly creating an AV for a slave's gid_index.
+ */
+ gid_attr = ah_attr->grh.sgid_attr;
+ if (gid_attr) {
+ ret = rdma_read_gid_l2_fields(gid_attr, &vlan_tag,
+ &ah->av.eth.s_mac[0]);
+ if (ret)
+ return ret;
+
+ ret = mlx4_ib_gid_index_to_real_index(ibdev, gid_attr);
+ if (ret < 0)
+ return ret;
+ ah->av.eth.gid_index = ret;
+ } else {
+ /* mlx4_ib_create_ah_slave fills in the s_mac and the vlan */
+ ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+ }
+
+ if (vlan_tag < 0x1000)
+ vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13;
+ ah->av.eth.port_pd = cpu_to_be32(to_mpd(ib_ah->pd)->pdn |
+ (rdma_ah_get_port_num(ah_attr) << 24));
+ ah->av.eth.vlan = cpu_to_be16(vlan_tag);
+ ah->av.eth.hop_limit = grh->hop_limit;
+ if (rdma_ah_get_static_rate(ah_attr)) {
+ ah->av.eth.stat_rate = rdma_ah_get_static_rate(ah_attr) +
+ MLX4_STAT_RATE_OFFSET;
+ while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support))
+ --ah->av.eth.stat_rate;
+ }
+ ah->av.eth.sl_tclass_flowlabel |=
+ cpu_to_be32((grh->traffic_class << 20) |
+ grh->flow_label);
+ /*
+ * HW requires multicast LID so we just choose one.
+ */
+ if (is_mcast)
+ ah->av.ib.dlid = cpu_to_be16(0xc000);
+
+ memcpy(ah->av.eth.dgid, grh->dgid.raw, 16);
+ ah->av.eth.sl_tclass_flowlabel |= cpu_to_be32(rdma_ah_get_sl(ah_attr)
+ << 29);
+ return 0;
+}
+
+int mlx4_ib_create_ah(struct ib_ah *ib_ah, struct rdma_ah_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
+
+ if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
+ if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
+ return -EINVAL;
+ /*
+ * TBD: need to handle the case when we get
+ * called in an atomic context and there we
+ * might sleep. We don't expect this
+ * currently since we're working with link
+ * local addresses which we can translate
+ * without going to sleep.
+ */
+ return create_iboe_ah(ib_ah, ah_attr);
+ }
+
+ create_ib_ah(ib_ah, ah_attr);
+ return 0;
+}
+
+int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+ int slave_sgid_index, u8 *s_mac, u16 vlan_tag)
+{
+ struct rdma_ah_attr slave_attr = *ah_attr;
+ struct rdma_ah_init_attr init_attr = {};
+ struct mlx4_ib_ah *mah = to_mah(ah);
+ int ret;
+
+ slave_attr.grh.sgid_attr = NULL;
+ slave_attr.grh.sgid_index = slave_sgid_index;
+ init_attr.ah_attr = &slave_attr;
+ ret = mlx4_ib_create_ah(ah, &init_attr, NULL);
+ if (ret)
+ return ret;
+
+ ah->type = ah_attr->type;
+
+ /* get rid of force-loopback bit */
+ mah->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF);
+
+ if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE)
+ memcpy(mah->av.eth.s_mac, s_mac, 6);
+
+ if (vlan_tag < 0x1000)
+ vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13;
+ mah->av.eth.vlan = cpu_to_be16(vlan_tag);
+
+ return 0;
+}
+
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
+{
+ struct mlx4_ib_ah *ah = to_mah(ibah);
+ int port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->type = ibah->type;
+
+ if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
+ rdma_ah_set_dlid(ah_attr, 0);
+ rdma_ah_set_sl(ah_attr,
+ be32_to_cpu(ah->av.eth.sl_tclass_flowlabel)
+ >> 29);
+ } else {
+ rdma_ah_set_dlid(ah_attr, be16_to_cpu(ah->av.ib.dlid));
+ rdma_ah_set_sl(ah_attr,
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel)
+ >> 28);
+ }
+
+ rdma_ah_set_port_num(ah_attr, port_num);
+ if (ah->av.ib.stat_rate)
+ rdma_ah_set_static_rate(ah_attr,
+ ah->av.ib.stat_rate -
+ MLX4_STAT_RATE_OFFSET);
+ rdma_ah_set_path_bits(ah_attr, ah->av.ib.g_slid & 0x7F);
+ if (mlx4_ib_ah_grh_present(ah)) {
+ u32 tc_fl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel);
+
+ rdma_ah_set_grh(ah_attr, NULL,
+ tc_fl & 0xfffff, ah->av.ib.gid_index,
+ ah->av.ib.hop_limit,
+ tc_fl >> 20);
+ rdma_ah_set_dgid_raw(ah_attr, ah->av.ib.dgid);
+ }
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c
new file mode 100644
index 000000000..111fa88a3
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -0,0 +1,902 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+ /***********************************************************/
+/*This file support the handling of the Alias GUID feature. */
+/***********************************************************/
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_pack.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/delay.h>
+#include "mlx4_ib.h"
+
+/*
+The driver keeps the current state of all guids, as they are in the HW.
+Whenever we receive an smp mad GUIDInfo record, the data will be cached.
+*/
+
+struct mlx4_alias_guid_work_context {
+ u8 port;
+ struct mlx4_ib_dev *dev ;
+ struct ib_sa_query *sa_query;
+ struct completion done;
+ int query_id;
+ struct list_head list;
+ int block_num;
+ ib_sa_comp_mask guid_indexes;
+ u8 method;
+};
+
+struct mlx4_next_alias_guid_work {
+ u8 port;
+ u8 block_num;
+ u8 method;
+ struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+};
+
+static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
+ int *resched_delay_sec);
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
+ u32 port_num, u8 *p_data)
+{
+ int i;
+ u64 guid_indexes;
+ int slave_id;
+ u32 port_index = port_num - 1;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+
+ guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+ ports_guid[port_num - 1].
+ all_rec_per_port[block_num].guid_indexes);
+ pr_debug("port: %u, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ /* The location of the specific index starts from bit number 4
+ * until bit num 11 */
+ if (test_bit(i + 4, (unsigned long *)&guid_indexes)) {
+ slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ;
+ if (slave_id >= dev->dev->num_slaves) {
+ pr_debug("The last slave: %d\n", slave_id);
+ return;
+ }
+
+ /* cache the guid: */
+ memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id],
+ &p_data[i * GUID_REC_SIZE],
+ GUID_REC_SIZE);
+ } else
+ pr_debug("Guid number: %d in block: %d"
+ " was not updated\n", i, block_num);
+ }
+}
+
+static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index)
+{
+ if (index >= NUM_ALIAS_GUID_PER_PORT) {
+ pr_err("%s: ERROR: asked for index:%d\n", __func__, index);
+ return (__force __be64) -1;
+ }
+ return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index];
+}
+
+
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index)
+{
+ return IB_SA_COMP_MASK(4 + index);
+}
+
+void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave,
+ int port, int slave_init)
+{
+ __be64 curr_guid, required_guid;
+ int record_num = slave / 8;
+ int index = slave % 8;
+ int port_index = port - 1;
+ unsigned long flags;
+ int do_work = 0;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ if (dev->sriov.alias_guid.ports_guid[port_index].state_flags &
+ GUID_STATE_NEED_PORT_INIT)
+ goto unlock;
+ if (!slave_init) {
+ curr_guid = *(__be64 *)&dev->sriov.
+ alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * index];
+ if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) ||
+ !curr_guid)
+ goto unlock;
+ required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
+ } else {
+ required_guid = mlx4_get_admin_guid(dev->dev, slave, port);
+ if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ goto unlock;
+ }
+ *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * index] = required_guid;
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].guid_indexes
+ |= mlx4_ib_get_aguid_comp_mask_from_ix(index);
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].status
+ = MLX4_GUID_INFO_STATUS_IDLE;
+ /* set to run immediately */
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].time_to_run = 0;
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ guids_retry_schedule[index] = 0;
+ do_work = 1;
+unlock:
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+
+ if (do_work)
+ mlx4_ib_init_alias_guid_work(dev, port_index);
+}
+
+/*
+ * Whenever new GUID is set/unset (guid table change) create event and
+ * notify the relevant slave (master also should be notified).
+ * If the GUID value is not as we have in the cache the slave will not be
+ * updated; in this case it waits for the smp_snoop or the port management
+ * event to call the function and to update the slave.
+ * block_number - the index of the block (16 blocks available)
+ * port_number - 1 or 2
+ */
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num, u32 port_num,
+ u8 *p_data)
+{
+ int i;
+ u64 guid_indexes;
+ int slave_id, slave_port;
+ enum slave_port_state new_state;
+ enum slave_port_state prev_state;
+ __be64 tmp_cur_ag, form_cache_ag;
+ enum slave_port_gen_event gen_event;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec;
+ unsigned long flags;
+ __be64 required_value;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+
+ rec = &dev->sriov.alias_guid.ports_guid[port_num - 1].
+ all_rec_per_port[block_num];
+ guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+ ports_guid[port_num - 1].
+ all_rec_per_port[block_num].guid_indexes);
+ pr_debug("port: %u, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+ /*calculate the slaves and notify them*/
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ /* the location of the specific index runs from bits 4..11 */
+ if (!(test_bit(i + 4, (unsigned long *)&guid_indexes)))
+ continue;
+
+ slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ;
+ if (slave_id >= dev->dev->persist->num_vfs + 1)
+ return;
+
+ slave_port = mlx4_phys_to_slave_port(dev->dev, slave_id, port_num);
+ if (slave_port < 0) /* this port isn't available for the VF */
+ continue;
+
+ tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE];
+ form_cache_ag = get_cached_alias_guid(dev, port_num,
+ (NUM_ALIAS_GUID_IN_REC * block_num) + i);
+ /*
+ * Check if guid is not the same as in the cache,
+ * If it is different, wait for the snoop_smp or the port mgmt
+ * change event to update the slave on its port state change
+ */
+ if (tmp_cur_ag != form_cache_ag)
+ continue;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+
+ if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ required_value = 0;
+
+ if (tmp_cur_ag == required_value) {
+ rec->guid_indexes = rec->guid_indexes &
+ ~mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ } else {
+ /* may notify port down if value is 0 */
+ if (tmp_cur_ag != MLX4_NOT_SET_GUID) {
+ spin_unlock_irqrestore(&dev->sriov.
+ alias_guid.ag_work_lock, flags);
+ continue;
+ }
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock,
+ flags);
+ mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
+ /*2 cases: Valid GUID, and Invalid Guid*/
+
+ if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/
+ prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num);
+ new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+ MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
+ &gen_event);
+ pr_debug("slave: %d, port: %u prev_port_state: %d,"
+ " new_port_state: %d, gen_event: %d\n",
+ slave_id, port_num, prev_state, new_state, gen_event);
+ if (gen_event == SLAVE_PORT_GEN_EVENT_UP) {
+ pr_debug("sending PORT_UP event to slave: %d, port: %u\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev, slave_id,
+ port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE);
+ }
+ } else { /* request to invalidate GUID */
+ set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+ MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+ &gen_event);
+ if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) {
+ pr_debug("sending PORT DOWN event to slave: %d, port: %u\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev,
+ slave_id,
+ port_num,
+ MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+ }
+ }
+ }
+}
+
+static void aliasguid_query_handler(int status,
+ struct ib_sa_guidinfo_rec *guid_rec,
+ void *context)
+{
+ struct mlx4_ib_dev *dev;
+ struct mlx4_alias_guid_work_context *cb_ctx = context;
+ u8 port_index ;
+ int i;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec;
+ unsigned long flags, flags1;
+ ib_sa_comp_mask declined_guid_indexes = 0;
+ ib_sa_comp_mask applied_guid_indexes = 0;
+ unsigned int resched_delay_sec = 0;
+
+ if (!context)
+ return;
+
+ dev = cb_ctx->dev;
+ port_index = cb_ctx->port - 1;
+ rec = &dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[cb_ctx->block_num];
+
+ if (status) {
+ pr_debug("(port: %d) failed: status = %d\n",
+ cb_ctx->port, status);
+ rec->time_to_run = ktime_get_boottime_ns() + 1 * NSEC_PER_SEC;
+ goto out;
+ }
+
+ if (guid_rec->block_num != cb_ctx->block_num) {
+ pr_err("block num mismatch: %d != %d\n",
+ cb_ctx->block_num, guid_rec->block_num);
+ goto out;
+ }
+
+ pr_debug("lid/port: %d/%d, block_num: %d\n",
+ be16_to_cpu(guid_rec->lid), cb_ctx->port,
+ guid_rec->block_num);
+
+ rec = &dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[guid_rec->block_num];
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ __be64 sm_response, required_val;
+
+ if (!(cb_ctx->guid_indexes &
+ mlx4_ib_get_aguid_comp_mask_from_ix(i)))
+ continue;
+ sm_response = *(__be64 *)&guid_rec->guid_info_list
+ [i * GUID_REC_SIZE];
+ required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+ if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) {
+ if (required_val ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ goto next_entry;
+
+ /* A new value was set till we got the response */
+ pr_debug("need to set new value %llx, record num %d, block_num:%d\n",
+ be64_to_cpu(required_val),
+ i, guid_rec->block_num);
+ goto entry_declined;
+ }
+
+ /* check if the SM didn't assign one of the records.
+ * if it didn't, re-ask for.
+ */
+ if (sm_response == MLX4_NOT_SET_GUID) {
+ if (rec->guids_retry_schedule[i] == 0)
+ mlx4_ib_warn(&dev->ib_dev,
+ "%s:Record num %d in block_num: %d was declined by SM\n",
+ __func__, i,
+ guid_rec->block_num);
+ goto entry_declined;
+ } else {
+ /* properly assigned record. */
+ /* We save the GUID we just got from the SM in the
+ * admin_guid in order to be persistent, and in the
+ * request from the sm the process will ask for the same GUID */
+ if (required_val &&
+ sm_response != required_val) {
+ /* Warn only on first retry */
+ if (rec->guids_retry_schedule[i] == 0)
+ mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set"
+ " admin guid after SysAdmin "
+ "configuration. "
+ "Record num %d in block_num:%d "
+ "was declined by SM, "
+ "new val(0x%llx) was kept, SM returned (0x%llx)\n",
+ __func__, i,
+ guid_rec->block_num,
+ be64_to_cpu(required_val),
+ be64_to_cpu(sm_response));
+ goto entry_declined;
+ } else {
+ *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] =
+ sm_response;
+ if (required_val == 0)
+ mlx4_set_admin_guid(dev->dev,
+ sm_response,
+ (guid_rec->block_num
+ * NUM_ALIAS_GUID_IN_REC) + i,
+ cb_ctx->port);
+ goto next_entry;
+ }
+ }
+entry_declined:
+ declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ rec->guids_retry_schedule[i] =
+ (rec->guids_retry_schedule[i] == 0) ? 1 :
+ min((unsigned int)60,
+ rec->guids_retry_schedule[i] * 2);
+ /* using the minimum value among all entries in that record */
+ resched_delay_sec = (resched_delay_sec == 0) ?
+ rec->guids_retry_schedule[i] :
+ min(resched_delay_sec,
+ rec->guids_retry_schedule[i]);
+ continue;
+
+next_entry:
+ rec->guids_retry_schedule[i] = 0;
+ }
+
+ applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes;
+ if (declined_guid_indexes ||
+ rec->guid_indexes & ~(applied_guid_indexes)) {
+ pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n",
+ guid_rec->block_num,
+ be64_to_cpu((__force __be64)rec->guid_indexes),
+ be64_to_cpu((__force __be64)applied_guid_indexes),
+ be64_to_cpu((__force __be64)declined_guid_indexes));
+ rec->time_to_run = ktime_get_boottime_ns() +
+ resched_delay_sec * NSEC_PER_SEC;
+ } else {
+ rec->status = MLX4_GUID_INFO_STATUS_SET;
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ /*
+ The func is call here to close the cases when the
+ sm doesn't send smp, so in the sa response the driver
+ notifies the slave.
+ */
+ mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num,
+ cb_ctx->port,
+ guid_rec->guid_info_list);
+out:
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ if (!dev->sriov.is_going_down) {
+ get_low_record_time_index(dev, port_index, &resched_delay_sec);
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq,
+ &dev->sriov.alias_guid.ports_guid[port_index].
+ alias_guid_work,
+ msecs_to_jiffies(resched_delay_sec * 1000));
+ }
+ if (cb_ctx->sa_query) {
+ list_del(&cb_ctx->list);
+ kfree(cb_ctx);
+ } else
+ complete(&cb_ctx->done);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index)
+{
+ int i;
+ u64 cur_admin_val;
+ ib_sa_comp_mask comp_mask = 0;
+
+ dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status
+ = MLX4_GUID_INFO_STATUS_SET;
+
+ /* calculate the comp_mask for that record.*/
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ cur_admin_val =
+ *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].all_recs[GUID_REC_SIZE * i];
+ /*
+ check the admin value: if it's for delete (~00LL) or
+ it is the first guid of the first record (hw guid) or
+ the records is not in ownership of the sysadmin and the sm doesn't
+ need to assign GUIDs, then don't put it up for assignment.
+ */
+ if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val ||
+ (!index && !i))
+ continue;
+ comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ }
+ dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].guid_indexes |= comp_mask;
+ if (dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].guid_indexes)
+ dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE;
+
+}
+
+static int set_guid_rec(struct ib_device *ibdev,
+ struct mlx4_next_alias_guid_work *rec)
+{
+ int err;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_guidinfo_rec guid_info_rec;
+ ib_sa_comp_mask comp_mask;
+ struct ib_port_attr attr;
+ struct mlx4_alias_guid_work_context *callback_context;
+ unsigned long resched_delay, flags, flags1;
+ u8 port = rec->port + 1;
+ int index = rec->block_num;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det;
+ struct list_head *head =
+ &dev->sriov.alias_guid.ports_guid[port - 1].cb_list;
+
+ memset(&attr, 0, sizeof(attr));
+ err = __mlx4_ib_query_port(ibdev, port, &attr, 1);
+ if (err) {
+ pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n",
+ err, port);
+ return err;
+ }
+ /*check the port was configured by the sm, otherwise no need to send */
+ if (attr.state != IB_PORT_ACTIVE) {
+ pr_debug("port %d not active...rescheduling\n", port);
+ resched_delay = 5 * HZ;
+ err = -EAGAIN;
+ goto new_schedule;
+ }
+
+ callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL);
+ if (!callback_context) {
+ err = -ENOMEM;
+ resched_delay = HZ * 5;
+ goto new_schedule;
+ }
+ callback_context->port = port;
+ callback_context->dev = dev;
+ callback_context->block_num = index;
+ callback_context->guid_indexes = rec_det->guid_indexes;
+ callback_context->method = rec->method;
+
+ memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec));
+
+ guid_info_rec.lid = ib_lid_be16(attr.lid);
+ guid_info_rec.block_num = index;
+
+ memcpy(guid_info_rec.guid_info_list, rec_det->all_recs,
+ GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC);
+ comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM |
+ rec_det->guid_indexes;
+
+ init_completion(&callback_context->done);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ list_add_tail(&callback_context->list, head);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+ callback_context->query_id =
+ ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client,
+ ibdev, port, &guid_info_rec,
+ comp_mask, rec->method, 1000,
+ GFP_KERNEL, aliasguid_query_handler,
+ callback_context,
+ &callback_context->sa_query);
+ if (callback_context->query_id < 0) {
+ pr_debug("ib_sa_guid_info_rec_query failed, query_id: "
+ "%d. will reschedule to the next 1 sec.\n",
+ callback_context->query_id);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ list_del(&callback_context->list);
+ kfree(callback_context);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ resched_delay = 1 * HZ;
+ err = -EAGAIN;
+ goto new_schedule;
+ }
+ err = 0;
+ goto out;
+
+new_schedule:
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ invalidate_guid_record(dev, port, index);
+ if (!dev->sriov.is_going_down) {
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+ &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+ resched_delay);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+
+out:
+ return err;
+}
+
+static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port)
+{
+ int j, k, entry;
+ __be64 guid;
+
+ /*Check if the SM doesn't need to assign the GUIDs*/
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
+ entry = j * NUM_ALIAS_GUID_IN_REC + k;
+ /* no request for the 0 entry (hw guid) */
+ if (!entry || entry > dev->dev->persist->num_vfs ||
+ !mlx4_is_slave_active(dev->dev, entry))
+ continue;
+ guid = mlx4_get_admin_guid(dev->dev, entry, port);
+ *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[j].all_recs
+ [GUID_REC_SIZE * k] = guid;
+ pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n",
+ entry,
+ be64_to_cpu(guid),
+ port);
+ }
+ }
+}
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port)
+{
+ int i;
+ unsigned long flags, flags1;
+
+ pr_debug("port %d\n", port);
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+ if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags &
+ GUID_STATE_NEED_PORT_INIT) {
+ mlx4_ib_guid_port_init(dev, port);
+ dev->sriov.alias_guid.ports_guid[port - 1].state_flags &=
+ (~GUID_STATE_NEED_PORT_INIT);
+ }
+ for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++)
+ invalidate_guid_record(dev, port, i);
+
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) {
+ /*
+ make sure no work waits in the queue, if the work is already
+ queued(not on the timer) the cancel will fail. That is not a problem
+ because we just want the work started.
+ */
+ cancel_delayed_work(&dev->sriov.alias_guid.
+ ports_guid[port - 1].alias_guid_work);
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+ &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+ 0);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void set_required_record(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *next_rec,
+ int record_index)
+{
+ int i;
+ int lowset_time_entry = -1;
+ int lowest_time = 0;
+ ib_sa_comp_mask delete_guid_indexes = 0;
+ ib_sa_comp_mask set_guid_indexes = 0;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec =
+ &dev->sriov.alias_guid.ports_guid[port].
+ all_rec_per_port[record_index];
+
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ if (!(rec->guid_indexes &
+ mlx4_ib_get_aguid_comp_mask_from_ix(i)))
+ continue;
+
+ if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ delete_guid_indexes |=
+ mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ else
+ set_guid_indexes |=
+ mlx4_ib_get_aguid_comp_mask_from_ix(i);
+
+ if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <=
+ lowest_time) {
+ lowset_time_entry = i;
+ lowest_time = rec->guids_retry_schedule[i];
+ }
+ }
+
+ memcpy(&next_rec->rec_det, rec, sizeof(*rec));
+ next_rec->port = port;
+ next_rec->block_num = record_index;
+
+ if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) {
+ next_rec->rec_det.guid_indexes = delete_guid_indexes;
+ next_rec->method = MLX4_GUID_INFO_RECORD_DELETE;
+ } else {
+ next_rec->rec_det.guid_indexes = set_guid_indexes;
+ next_rec->method = MLX4_GUID_INFO_RECORD_SET;
+ }
+}
+
+/* return index of record that should be updated based on lowest
+ * rescheduled time
+ */
+static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
+ int *resched_delay_sec)
+{
+ int record_index = -1;
+ u64 low_record_time = 0;
+ struct mlx4_sriov_alias_guid_info_rec_det rec;
+ int j;
+
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ rec = dev->sriov.alias_guid.ports_guid[port].
+ all_rec_per_port[j];
+ if (rec.status == MLX4_GUID_INFO_STATUS_IDLE &&
+ rec.guid_indexes) {
+ if (record_index == -1 ||
+ rec.time_to_run < low_record_time) {
+ record_index = j;
+ low_record_time = rec.time_to_run;
+ }
+ }
+ }
+ if (resched_delay_sec) {
+ u64 curr_time = ktime_get_boottime_ns();
+
+ *resched_delay_sec = (low_record_time < curr_time) ? 0 :
+ div_u64((low_record_time - curr_time), NSEC_PER_SEC);
+ }
+
+ return record_index;
+}
+
+/* The function returns the next record that was
+ * not configured (or failed to be configured) */
+static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *rec)
+{
+ unsigned long flags;
+ int record_index;
+ int ret = 0;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ record_index = get_low_record_time_index(dev, port, NULL);
+
+ if (record_index < 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ set_required_record(dev, port, rec, record_index);
+out:
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ return ret;
+}
+
+static void alias_guid_work(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ int ret = 0;
+ struct mlx4_next_alias_guid_work *rec;
+ struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port =
+ container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det,
+ alias_guid_work);
+ struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent;
+ struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid,
+ struct mlx4_ib_sriov,
+ alias_guid);
+ struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov);
+
+ rec = kzalloc(sizeof *rec, GFP_KERNEL);
+ if (!rec)
+ return;
+
+ pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1);
+ ret = get_next_record_to_update(dev, sriov_alias_port->port, rec);
+ if (ret) {
+ pr_debug("No more records to update.\n");
+ goto out;
+ }
+
+ set_guid_rec(&dev->ib_dev, rec);
+out:
+ kfree(rec);
+}
+
+
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port)
+{
+ unsigned long flags, flags1;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ if (!dev->sriov.is_going_down) {
+ /* If there is pending one should cancel then run, otherwise
+ * won't run till previous one is ended as same work
+ * struct is used.
+ */
+ cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port].
+ alias_guid_work);
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq,
+ &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+ int i;
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct mlx4_alias_guid_work_context *cb_ctx;
+ struct mlx4_sriov_alias_guid_port_rec_det *det;
+ struct ib_sa_query *sa_query;
+ unsigned long flags;
+
+ for (i = 0 ; i < dev->num_ports; i++) {
+ det = &sriov->alias_guid.ports_guid[i];
+ cancel_delayed_work_sync(&det->alias_guid_work);
+ spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+ while (!list_empty(&det->cb_list)) {
+ cb_ctx = list_entry(det->cb_list.next,
+ struct mlx4_alias_guid_work_context,
+ list);
+ sa_query = cb_ctx->sa_query;
+ cb_ctx->sa_query = NULL;
+ list_del(&cb_ctx->list);
+ spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+ ib_sa_cancel_query(cb_ctx->query_id, sa_query);
+ wait_for_completion(&cb_ctx->done);
+ kfree(cb_ctx);
+ spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+ }
+ spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+ }
+ for (i = 0 ; i < dev->num_ports; i++)
+ destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+ kfree(dev->sriov.alias_guid.sa_client);
+}
+
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+ char alias_wq_name[15];
+ int ret = 0;
+ int i, j;
+ union ib_gid gid;
+
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+ dev->sriov.alias_guid.sa_client =
+ kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL);
+ if (!dev->sriov.alias_guid.sa_client)
+ return -ENOMEM;
+
+ ib_sa_register_client(dev->sriov.alias_guid.sa_client);
+
+ spin_lock_init(&dev->sriov.alias_guid.ag_work_lock);
+
+ for (i = 1; i <= dev->num_ports; ++i) {
+ if (dev->ib_dev.ops.query_gid(&dev->ib_dev, i, 0, &gid)) {
+ ret = -EFAULT;
+ goto err_unregister;
+ }
+ }
+
+ for (i = 0 ; i < dev->num_ports; i++) {
+ memset(&dev->sriov.alias_guid.ports_guid[i], 0,
+ sizeof (struct mlx4_sriov_alias_guid_port_rec_det));
+ dev->sriov.alias_guid.ports_guid[i].state_flags |=
+ GUID_STATE_NEED_PORT_INIT;
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ /* mark each val as it was deleted */
+ memset(dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs, 0xFF,
+ sizeof(dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs));
+ }
+ INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list);
+ /*prepare the records, set them to be allocated by sm*/
+ if (mlx4_ib_sm_guid_assign)
+ for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++)
+ mlx4_set_admin_guid(dev->dev, 0, j, i + 1);
+ for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++)
+ invalidate_guid_record(dev, i + 1, j);
+
+ dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid;
+ dev->sriov.alias_guid.ports_guid[i].port = i;
+
+ snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i);
+ dev->sriov.alias_guid.ports_guid[i].wq =
+ alloc_ordered_workqueue(alias_wq_name, WQ_MEM_RECLAIM);
+ if (!dev->sriov.alias_guid.ports_guid[i].wq) {
+ ret = -ENOMEM;
+ goto err_thread;
+ }
+ INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work,
+ alias_guid_work);
+ }
+ return 0;
+
+err_thread:
+ for (--i; i >= 0; i--) {
+ destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ dev->sriov.alias_guid.ports_guid[i].wq = NULL;
+ }
+
+err_unregister:
+ ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+ kfree(dev->sriov.alias_guid.sa_client);
+ dev->sriov.alias_guid.sa_client = NULL;
+ pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
new file mode 100644
index 000000000..12b481d13
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/idr.h>
+#include <rdma/ib_cm.h>
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT (30 * HZ)
+
+struct id_map_entry {
+ struct rb_node node;
+
+ u32 sl_cm_id;
+ u32 pv_cm_id;
+ int slave_id;
+ int scheduled_delete;
+ struct mlx4_ib_dev *dev;
+
+ struct list_head list;
+ struct delayed_work timeout;
+};
+
+struct rej_tmout_entry {
+ int slave;
+ u32 rem_pv_cm_id;
+ struct delayed_work timeout;
+ struct xarray *xa_rej_tmout;
+};
+
+struct cm_generic_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ unsigned char unused[2];
+ __be16 rej_reason;
+};
+
+struct cm_sidr_generic_msg {
+ struct ib_mad_hdr hdr;
+ __be32 request_id;
+};
+
+struct cm_req_msg {
+ unsigned char unused[0x60];
+ union ib_gid primary_path_sgid;
+};
+
+static struct workqueue_struct *cm_wq;
+
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ msg->request_id = cpu_to_be32(cm_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ pr_err("trying to set local_comm_id in SIDR_REP\n");
+ return;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ msg->local_comm_id = cpu_to_be32(cm_id);
+ }
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ return be32_to_cpu(msg->request_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ pr_err("trying to set local_comm_id in SIDR_REP\n");
+ return -1;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ return be32_to_cpu(msg->local_comm_id);
+ }
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ msg->request_id = cpu_to_be32(cm_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+ return;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ msg->remote_comm_id = cpu_to_be32(cm_id);
+ }
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ return be32_to_cpu(msg->request_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+ return -1;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ return be32_to_cpu(msg->remote_comm_id);
+ }
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad)
+{
+ struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+ return msg->primary_path_sgid;
+}
+
+/* Lock should be taken before called */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+ struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+ struct rb_node *node = sl_id_map->rb_node;
+
+ while (node) {
+ struct id_map_entry *id_map_entry =
+ rb_entry(node, struct id_map_entry, node);
+
+ if (id_map_entry->sl_cm_id > sl_cm_id)
+ node = node->rb_left;
+ else if (id_map_entry->sl_cm_id < sl_cm_id)
+ node = node->rb_right;
+ else if (id_map_entry->slave_id > slave_id)
+ node = node->rb_left;
+ else if (id_map_entry->slave_id < slave_id)
+ node = node->rb_right;
+ else
+ return id_map_entry;
+ }
+ return NULL;
+}
+
+static void id_map_ent_timeout(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout);
+ struct id_map_entry *found_ent;
+ struct mlx4_ib_dev *dev = ent->dev;
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+
+ spin_lock(&sriov->id_map_lock);
+ if (!xa_erase(&sriov->pv_id_table, ent->pv_cm_id))
+ goto out;
+ found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id);
+ if (found_ent && found_ent == ent)
+ rb_erase(&found_ent->node, sl_id_map);
+
+out:
+ list_del(&ent->list);
+ spin_unlock(&sriov->id_map_lock);
+ kfree(ent);
+}
+
+static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new)
+{
+ struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+ struct rb_node **link = &sl_id_map->rb_node, *parent = NULL;
+ struct id_map_entry *ent;
+ int slave_id = new->slave_id;
+ int sl_cm_id = new->sl_cm_id;
+
+ ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id);
+ if (ent) {
+ pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n",
+ sl_cm_id);
+
+ rb_replace_node(&ent->node, &new->node, sl_id_map);
+ return;
+ }
+
+ /* Go to the bottom of the tree */
+ while (*link) {
+ parent = *link;
+ ent = rb_entry(parent, struct id_map_entry, node);
+
+ if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id))
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, sl_id_map);
+}
+
+static struct id_map_entry *
+id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id)
+{
+ int ret;
+ struct id_map_entry *ent;
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+ ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL);
+ if (!ent)
+ return ERR_PTR(-ENOMEM);
+
+ ent->sl_cm_id = sl_cm_id;
+ ent->slave_id = slave_id;
+ ent->scheduled_delete = 0;
+ ent->dev = to_mdev(ibdev);
+ INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
+
+ ret = xa_alloc_cyclic(&sriov->pv_id_table, &ent->pv_cm_id, ent,
+ xa_limit_32b, &sriov->pv_id_next, GFP_KERNEL);
+ if (ret >= 0) {
+ spin_lock(&sriov->id_map_lock);
+ sl_id_map_add(ibdev, ent);
+ list_add_tail(&ent->list, &sriov->cm_list);
+ spin_unlock(&sriov->id_map_lock);
+ return ent;
+ }
+
+ /*error flow*/
+ kfree(ent);
+ mlx4_ib_warn(ibdev, "Allocation failed (err:0x%x)\n", ret);
+ return ERR_PTR(-ENOMEM);
+}
+
+static struct id_map_entry *
+id_map_get(struct ib_device *ibdev, int *pv_cm_id, int slave_id, int sl_cm_id)
+{
+ struct id_map_entry *ent;
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+ spin_lock(&sriov->id_map_lock);
+ if (*pv_cm_id == -1) {
+ ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id);
+ if (ent)
+ *pv_cm_id = (int) ent->pv_cm_id;
+ } else
+ ent = xa_load(&sriov->pv_id_table, *pv_cm_id);
+ spin_unlock(&sriov->id_map_lock);
+
+ return ent;
+}
+
+static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id)
+{
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+ unsigned long flags;
+
+ spin_lock(&sriov->id_map_lock);
+ spin_lock_irqsave(&sriov->going_down_lock, flags);
+ /*make sure that there is no schedule inside the scheduled work.*/
+ if (!sriov->is_going_down && !id->scheduled_delete) {
+ id->scheduled_delete = 1;
+ queue_delayed_work(cm_wq, &id->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+ } else if (id->scheduled_delete) {
+ /* Adjust timeout if already scheduled */
+ mod_delayed_work(cm_wq, &id->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+ }
+ spin_unlock_irqrestore(&sriov->going_down_lock, flags);
+ spin_unlock(&sriov->id_map_lock);
+}
+
+#define REJ_REASON(m) be16_to_cpu(((struct cm_generic_msg *)(m))->rej_reason)
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+ struct ib_mad *mad)
+{
+ struct id_map_entry *id;
+ u32 sl_cm_id;
+ int pv_cm_id = -1;
+
+ if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_MRA_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID ||
+ (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID && REJ_REASON(mad) == IB_CM_REJ_TIMEOUT)) {
+ sl_cm_id = get_local_comm_id(mad);
+ id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
+ if (id)
+ goto cont;
+ id = id_map_alloc(ibdev, slave_id, sl_cm_id);
+ if (IS_ERR(id)) {
+ mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n",
+ __func__, slave_id, sl_cm_id);
+ return PTR_ERR(id);
+ }
+ } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ return 0;
+ } else {
+ sl_cm_id = get_local_comm_id(mad);
+ id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
+ }
+
+ if (!id) {
+ pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL! attr_id: 0x%x\n",
+ slave_id, sl_cm_id, be16_to_cpu(mad->mad_hdr.attr_id));
+ return -EINVAL;
+ }
+
+cont:
+ set_local_comm_id(mad, id->pv_cm_id);
+
+ if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+ schedule_delayed(ibdev, id);
+ return 0;
+}
+
+static void rej_tmout_timeout(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct rej_tmout_entry *item = container_of(delay, struct rej_tmout_entry, timeout);
+ struct rej_tmout_entry *deleted;
+
+ deleted = xa_cmpxchg(item->xa_rej_tmout, item->rem_pv_cm_id, item, NULL, 0);
+
+ if (deleted != item)
+ pr_debug("deleted(%p) != item(%p)\n", deleted, item);
+
+ kfree(item);
+}
+
+static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int slave)
+{
+ struct rej_tmout_entry *item;
+ struct rej_tmout_entry *old;
+ int ret = 0;
+
+ xa_lock(&sriov->xa_rej_tmout);
+ item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id);
+
+ if (item) {
+ if (xa_err(item))
+ ret = xa_err(item);
+ else
+ /* If a retry, adjust delayed work */
+ mod_delayed_work(cm_wq, &item->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+ goto err_or_exists;
+ }
+ xa_unlock(&sriov->xa_rej_tmout);
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item)
+ return -ENOMEM;
+
+ INIT_DELAYED_WORK(&item->timeout, rej_tmout_timeout);
+ item->slave = slave;
+ item->rem_pv_cm_id = rem_pv_cm_id;
+ item->xa_rej_tmout = &sriov->xa_rej_tmout;
+
+ old = xa_cmpxchg(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id, NULL, item, GFP_KERNEL);
+ if (old) {
+ pr_debug(
+ "Non-null old entry (%p) or error (%d) when inserting\n",
+ old, xa_err(old));
+ kfree(item);
+ return xa_err(old);
+ }
+
+ queue_delayed_work(cm_wq, &item->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+
+ return 0;
+
+err_or_exists:
+ xa_unlock(&sriov->xa_rej_tmout);
+ return ret;
+}
+
+static int lookup_rej_tmout_slave(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id)
+{
+ struct rej_tmout_entry *item;
+ int slave;
+
+ xa_lock(&sriov->xa_rej_tmout);
+ item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id);
+
+ if (!item || xa_err(item)) {
+ pr_debug("Could not find slave. rem_pv_cm_id 0x%x error: %d\n",
+ rem_pv_cm_id, xa_err(item));
+ slave = !item ? -ENOENT : xa_err(item);
+ } else {
+ slave = item->slave;
+ }
+ xa_unlock(&sriov->xa_rej_tmout);
+
+ return slave;
+}
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+ struct ib_mad *mad)
+{
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+ u32 rem_pv_cm_id = get_local_comm_id(mad);
+ u32 pv_cm_id;
+ struct id_map_entry *id;
+ int sts;
+
+ if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ union ib_gid gid;
+
+ if (!slave)
+ return 0;
+
+ gid = gid_from_req_msg(ibdev, mad);
+ *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id);
+ if (*slave < 0) {
+ mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n",
+ be64_to_cpu(gid.global.interface_id));
+ return -ENOENT;
+ }
+
+ sts = alloc_rej_tmout(sriov, rem_pv_cm_id, *slave);
+ if (sts)
+ /* Even if this fails, we pass on the REQ to the slave */
+ pr_debug("Could not allocate rej_tmout entry. rem_pv_cm_id 0x%x slave %d status %d\n",
+ rem_pv_cm_id, *slave, sts);
+
+ return 0;
+ }
+
+ pv_cm_id = get_remote_comm_id(mad);
+ id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1);
+
+ if (!id) {
+ if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID &&
+ REJ_REASON(mad) == IB_CM_REJ_TIMEOUT && slave) {
+ *slave = lookup_rej_tmout_slave(sriov, rem_pv_cm_id);
+
+ return (*slave < 0) ? *slave : 0;
+ }
+ pr_debug("Couldn't find an entry for pv_cm_id 0x%x, attr_id 0x%x\n",
+ pv_cm_id, be16_to_cpu(mad->mad_hdr.attr_id));
+ return -ENOENT;
+ }
+
+ if (slave)
+ *slave = id->slave_id;
+ set_remote_comm_id(mad, id->sl_cm_id);
+
+ if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_REJ_ATTR_ID)
+ schedule_delayed(ibdev, id);
+
+ return 0;
+}
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
+{
+ spin_lock_init(&dev->sriov.id_map_lock);
+ INIT_LIST_HEAD(&dev->sriov.cm_list);
+ dev->sriov.sl_id_map = RB_ROOT;
+ xa_init_flags(&dev->sriov.pv_id_table, XA_FLAGS_ALLOC);
+ xa_init(&dev->sriov.xa_rej_tmout);
+}
+
+static void rej_tmout_xa_cleanup(struct mlx4_ib_sriov *sriov, int slave)
+{
+ struct rej_tmout_entry *item;
+ bool flush_needed = false;
+ unsigned long id;
+ int cnt = 0;
+
+ xa_lock(&sriov->xa_rej_tmout);
+ xa_for_each(&sriov->xa_rej_tmout, id, item) {
+ if (slave < 0 || slave == item->slave) {
+ mod_delayed_work(cm_wq, &item->timeout, 0);
+ flush_needed = true;
+ ++cnt;
+ }
+ }
+ xa_unlock(&sriov->xa_rej_tmout);
+
+ if (flush_needed) {
+ flush_workqueue(cm_wq);
+ pr_debug("Deleted %d entries in xarray for slave %d during cleanup\n",
+ cnt, slave);
+ }
+
+ if (slave < 0)
+ WARN_ON(!xa_empty(&sriov->xa_rej_tmout));
+}
+
+/* slave = -1 ==> all slaves */
+/* TBD -- call paravirt clean for single slave. Need for slave RESET event */
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
+{
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+ struct list_head lh;
+ struct rb_node *nd;
+ int need_flush = 0;
+ struct id_map_entry *map, *tmp_map;
+ /* cancel all delayed work queue entries */
+ INIT_LIST_HEAD(&lh);
+ spin_lock(&sriov->id_map_lock);
+ list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+ if (slave < 0 || slave == map->slave_id) {
+ if (map->scheduled_delete)
+ need_flush |= !cancel_delayed_work(&map->timeout);
+ }
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+
+ if (need_flush)
+ flush_workqueue(cm_wq); /* make sure all timers were flushed */
+
+ /* now, remove all leftover entries from databases*/
+ spin_lock(&sriov->id_map_lock);
+ if (slave < 0) {
+ while (rb_first(sl_id_map)) {
+ struct id_map_entry *ent =
+ rb_entry(rb_first(sl_id_map),
+ struct id_map_entry, node);
+
+ rb_erase(&ent->node, sl_id_map);
+ xa_erase(&sriov->pv_id_table, ent->pv_cm_id);
+ }
+ list_splice_init(&dev->sriov.cm_list, &lh);
+ } else {
+ /* first, move nodes belonging to slave to db remove list */
+ nd = rb_first(sl_id_map);
+ while (nd) {
+ struct id_map_entry *ent =
+ rb_entry(nd, struct id_map_entry, node);
+ nd = rb_next(nd);
+ if (ent->slave_id == slave)
+ list_move_tail(&ent->list, &lh);
+ }
+ /* remove those nodes from databases */
+ list_for_each_entry_safe(map, tmp_map, &lh, list) {
+ rb_erase(&map->node, sl_id_map);
+ xa_erase(&sriov->pv_id_table, map->pv_cm_id);
+ }
+
+ /* add remaining nodes from cm_list */
+ list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+ if (slave == map->slave_id)
+ list_move_tail(&map->list, &lh);
+ }
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+
+ /* free any map entries left behind due to cancel_delayed_work above */
+ list_for_each_entry_safe(map, tmp_map, &lh, list) {
+ list_del(&map->list);
+ kfree(map);
+ }
+
+ rej_tmout_xa_cleanup(sriov, slave);
+}
+
+int mlx4_ib_cm_init(void)
+{
+ cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0);
+ if (!cm_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx4_ib_cm_destroy(void)
+{
+ destroy_workqueue(cm_wq);
+}
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644
index 000000000..4cd738aae
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+#include <rdma/mlx4-abi.h>
+#include <rdma/uverbs_ioctl.h>
+
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+ struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+ ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_cq *ibcq;
+
+ if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+ pr_warn("Unexpected event type %d "
+ "on CQ %06x\n", type, cq->cqn);
+ return;
+ }
+
+ ibcq = &to_mibcq(cq)->ibcq;
+ if (ibcq->event_handler) {
+ event.device = ibcq->device;
+ event.event = IB_EVENT_CQ_ERR;
+ event.element.cq = ibcq;
+ ibcq->event_handler(&event, ibcq->cq_context);
+ }
+}
+
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+ return mlx4_buf_offset(&buf->buf, n * buf->entry_size);
+}
+
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ return get_cqe_from_buf(&cq->buf, n);
+}
+
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+ struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe);
+
+ return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+ return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+ struct mlx4_ib_dev *dev = to_mdev(cq->device);
+
+ return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period);
+}
+
+static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent)
+{
+ int err;
+
+ err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
+ PAGE_SIZE * 2, &buf->buf);
+
+ if (err)
+ goto out;
+
+ buf->entry_size = dev->dev->caps.cqe_size;
+ err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
+ &buf->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf);
+ if (err)
+ goto err_mtt;
+
+ return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+ mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf);
+
+out:
+ return err;
+}
+
+static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
+{
+ mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
+}
+
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_cq_buf *buf,
+ struct ib_umem **umem, u64 buf_addr, int cqe)
+{
+ int err;
+ int cqe_size = dev->dev->caps.cqe_size;
+ int shift;
+ int n;
+
+ *umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(*umem))
+ return PTR_ERR(*umem);
+
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n);
+ err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt);
+
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem);
+ if (err)
+ goto err_mtt;
+
+ return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+ ib_umem_release(*umem);
+
+ return err;
+}
+
+#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata)
+{
+ struct ib_device *ibdev = ibcq->device;
+ int entries = attr->cqe;
+ int vector = attr->comp_vector;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_uar *uar;
+ void *buf_addr;
+ int err;
+ struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
+
+ if (entries < 1 || entries > dev->dev->caps.max_cqes)
+ return -EINVAL;
+
+ if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
+ return -EINVAL;
+
+ entries = roundup_pow_of_two(entries + 1);
+ cq->ibcq.cqe = entries - 1;
+ mutex_init(&cq->resize_mutex);
+ spin_lock_init(&cq->lock);
+ cq->resize_buf = NULL;
+ cq->resize_umem = NULL;
+ cq->create_flags = attr->flags;
+ INIT_LIST_HEAD(&cq->send_qp_list);
+ INIT_LIST_HEAD(&cq->recv_qp_list);
+
+ if (udata) {
+ struct mlx4_ib_create_cq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_cq;
+ }
+
+ buf_addr = (void *)(unsigned long)ucmd.buf_addr;
+ err = mlx4_ib_get_cq_umem(dev, &cq->buf, &cq->umem,
+ ucmd.buf_addr, entries);
+ if (err)
+ goto err_cq;
+
+ err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &cq->db);
+ if (err)
+ goto err_mtt;
+
+ uar = &context->uar;
+ cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS;
+ } else {
+ err = mlx4_db_alloc(dev->dev, &cq->db, 1);
+ if (err)
+ goto err_cq;
+
+ cq->mcq.set_ci_db = cq->db.db;
+ cq->mcq.arm_db = cq->db.db + 1;
+ *cq->mcq.set_ci_db = 0;
+ *cq->mcq.arm_db = 0;
+
+ err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries);
+ if (err)
+ goto err_db;
+
+ buf_addr = &cq->buf.buf;
+
+ uar = &dev->priv_uar;
+ cq->mcq.usage = MLX4_RES_USAGE_DRIVER;
+ }
+
+ if (dev->eq_table)
+ vector = dev->eq_table[vector % ibdev->num_comp_vectors];
+
+ err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, cq->db.dma,
+ &cq->mcq, vector, 0,
+ !!(cq->create_flags &
+ IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION),
+ buf_addr, !!udata);
+ if (err)
+ goto err_dbmap;
+
+ if (udata)
+ cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
+ else
+ cq->mcq.comp = mlx4_ib_cq_comp;
+ cq->mcq.event = mlx4_ib_cq_event;
+
+ if (udata)
+ if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_cq_free;
+ }
+
+ return 0;
+
+err_cq_free:
+ mlx4_cq_free(dev->dev, &cq->mcq);
+
+err_dbmap:
+ if (udata)
+ mlx4_ib_db_unmap_user(context, &cq->db);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+ ib_umem_release(cq->umem);
+ if (!udata)
+ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_db:
+ if (!udata)
+ mlx4_db_free(dev->dev, &cq->db);
+err_cq:
+ return err;
+}
+
+static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+ int entries)
+{
+ int err;
+
+ if (cq->resize_buf)
+ return -EBUSY;
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
+ if (!cq->resize_buf)
+ return -ENOMEM;
+
+ err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+ if (err) {
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ return err;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ return 0;
+}
+
+static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+ int entries, struct ib_udata *udata)
+{
+ struct mlx4_ib_resize_cq ucmd;
+ int err;
+
+ if (cq->resize_umem)
+ return -EBUSY;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+ return -EFAULT;
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
+ if (!cq->resize_buf)
+ return -ENOMEM;
+
+ err = mlx4_ib_get_cq_umem(dev, &cq->resize_buf->buf, &cq->resize_umem,
+ ucmd.buf_addr, entries);
+ if (err) {
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ return err;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ return 0;
+}
+
+static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
+{
+ u32 i;
+
+ i = cq->mcq.cons_index;
+ while (get_sw_cqe(cq, i))
+ ++i;
+
+ return i - cq->mcq.cons_index;
+}
+
+static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
+{
+ struct mlx4_cqe *cqe, *new_cqe;
+ int i;
+ int cqe_size = cq->buf.entry_size;
+ int cqe_inc = cqe_size == 64 ? 1 : 0;
+
+ i = cq->mcq.cons_index;
+ cqe = get_cqe(cq, i & cq->ibcq.cqe);
+ cqe += cqe_inc;
+
+ while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
+ new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
+ (i + 1) & cq->resize_buf->cqe);
+ memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size);
+ new_cqe += cqe_inc;
+
+ new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
+ (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
+ cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+ cqe += cqe_inc;
+ }
+ ++cq->mcq.cons_index;
+}
+
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibcq->device);
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_mtt mtt;
+ int outst_cqe;
+ int err;
+
+ mutex_lock(&cq->resize_mutex);
+ if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries == ibcq->cqe + 1) {
+ err = 0;
+ goto out;
+ }
+
+ if (entries > dev->dev->caps.max_cqes + 1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (ibcq->uobject) {
+ err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
+ if (err)
+ goto out;
+ } else {
+ /* Can't be smaller than the number of outstanding CQEs */
+ outst_cqe = mlx4_ib_get_outstanding_cqes(cq);
+ if (entries < outst_cqe + 1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mlx4_alloc_resize_buf(dev, cq, entries);
+ if (err)
+ goto out;
+ }
+
+ mtt = cq->buf.mtt;
+
+ err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt);
+ if (err)
+ goto err_buf;
+
+ mlx4_mtt_cleanup(dev->dev, &mtt);
+ if (ibcq->uobject) {
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+ ib_umem_release(cq->umem);
+ cq->umem = cq->resize_umem;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ cq->resize_umem = NULL;
+ } else {
+ struct mlx4_ib_cq_buf tmp_buf;
+ int tmp_cqe = 0;
+
+ spin_lock_irq(&cq->lock);
+ if (cq->resize_buf) {
+ mlx4_ib_cq_resize_copy_cqes(cq);
+ tmp_buf = cq->buf;
+ tmp_cqe = cq->ibcq.cqe;
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ }
+ spin_unlock_irq(&cq->lock);
+
+ if (tmp_cqe)
+ mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe);
+ }
+
+ goto out;
+
+err_buf:
+ mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt);
+ if (!ibcq->uobject)
+ mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf,
+ cq->resize_buf->cqe);
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+
+ ib_umem_release(cq->resize_umem);
+ cq->resize_umem = NULL;
+out:
+ mutex_unlock(&cq->resize_mutex);
+
+ return err;
+}
+
+int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(cq->device);
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+
+ mlx4_cq_free(dev->dev, &mcq->mcq);
+ mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
+
+ if (udata) {
+ mlx4_ib_db_unmap_user(
+ rdma_udata_to_drv_context(
+ udata,
+ struct mlx4_ib_ucontext,
+ ibucontext),
+ &mcq->db);
+ } else {
+ mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
+ mlx4_db_free(dev->dev, &mcq->db);
+ }
+ ib_umem_release(mcq->umem);
+ return 0;
+}
+
+static void dump_cqe(void *cqe)
+{
+ __be32 *buf = cqe;
+
+ pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]),
+ be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]),
+ be32_to_cpu(buf[6]), be32_to_cpu(buf[7]));
+}
+
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+ struct ib_wc *wc)
+{
+ if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
+ pr_debug("local QP operation err "
+ "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+ "opcode = %02x)\n",
+ be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+ cqe->vendor_err_syndrome,
+ cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ dump_cqe(cqe);
+ }
+
+ switch (cqe->syndrome) {
+ case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+ wc->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+ wc->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+ wc->status = IB_WC_MW_BIND_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+ wc->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+ wc->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+ wc->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+ wc->status = IB_WC_REM_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+ wc->status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+ wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+ wc->status = IB_WC_REM_ABORT_ERR;
+ break;
+ default:
+ wc->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+static int mlx4_ib_ipoib_csum_ok(__be16 status, u8 badfcs_enc, __be16 checksum)
+{
+ return ((badfcs_enc & MLX4_CQE_STATUS_L4_CSUM) ||
+ ((status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+ (status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
+ MLX4_CQE_STATUS_UDP)) &&
+ (checksum == cpu_to_be16(0xffff))));
+}
+
+static void use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
+ unsigned tail, struct mlx4_cqe *cqe, int is_eth)
+{
+ struct mlx4_ib_proxy_sqp_hdr *hdr;
+
+ ib_dma_sync_single_for_cpu(qp->ibqp.device,
+ qp->sqp_proxy_rcv[tail].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
+ wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index);
+ wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF;
+ wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
+ wc->dlid_path_bits = 0;
+
+ if (is_eth) {
+ wc->slid = 0;
+ wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid);
+ memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4);
+ memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2);
+ wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+ } else {
+ wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32);
+ wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+ }
+}
+
+static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries,
+ struct ib_wc *wc, int *npolled, int is_send)
+{
+ struct mlx4_ib_wq *wq;
+ unsigned cur;
+ int i;
+
+ wq = is_send ? &qp->sq : &qp->rq;
+ cur = wq->head - wq->tail;
+
+ if (cur == 0)
+ return;
+
+ for (i = 0; i < cur && *npolled < num_entries; i++) {
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR;
+ wq->tail++;
+ (*npolled)++;
+ wc->qp = &qp->ibqp;
+ wc++;
+ }
+}
+
+static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries,
+ struct ib_wc *wc, int *npolled)
+{
+ struct mlx4_ib_qp *qp;
+
+ *npolled = 0;
+ /* Find uncompleted WQEs belonging to that cq and return
+ * simulated FLUSH_ERR completions
+ */
+ list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) {
+ mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1);
+ if (*npolled >= num_entries)
+ goto out;
+ }
+
+ list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) {
+ mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0);
+ if (*npolled >= num_entries)
+ goto out;
+ }
+
+out:
+ return;
+}
+
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+ struct mlx4_ib_qp **cur_qp,
+ struct ib_wc *wc)
+{
+ struct mlx4_cqe *cqe;
+ struct mlx4_qp *mqp;
+ struct mlx4_ib_wq *wq;
+ struct mlx4_ib_srq *srq;
+ struct mlx4_srq *msrq = NULL;
+ int is_send;
+ int is_error;
+ int is_eth;
+ u32 g_mlpath_rqpn;
+ u16 wqe_ctr;
+ unsigned tail = 0;
+
+repoll:
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ if (cq->buf.entry_size == 64)
+ cqe++;
+
+ ++cq->mcq.cons_index;
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR;
+
+ /* Resize CQ in progress */
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) {
+ if (cq->resize_buf) {
+ struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device);
+
+ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ }
+
+ goto repoll;
+ }
+
+ if (!*cur_qp ||
+ (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+ be32_to_cpu(cqe->vlan_my_qpn));
+ *cur_qp = to_mibqp(mqp);
+ }
+
+ wc->qp = &(*cur_qp)->ibqp;
+
+ if (wc->qp->qp_type == IB_QPT_XRC_TGT) {
+ u32 srq_num;
+ g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+ srq_num = g_mlpath_rqpn & 0xffffff;
+ /* SRQ is also in the radix tree */
+ msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
+ srq_num);
+ }
+
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ if (!(*cur_qp)->sq_signal_bits) {
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ }
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ } else if ((*cur_qp)->ibqp.srq) {
+ srq = to_msrq((*cur_qp)->ibqp.srq);
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else if (msrq) {
+ srq = to_mibsrq(msrq);
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else {
+ wq = &(*cur_qp)->rq;
+ tail = wq->tail & (wq->wqe_cnt - 1);
+ wc->wr_id = wq->wrid[tail];
+ ++wq->tail;
+ }
+
+ if (unlikely(is_error)) {
+ mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+ return 0;
+ }
+
+ wc->status = IB_WC_SUCCESS;
+
+ if (is_send) {
+ wc->wc_flags = 0;
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ fallthrough;
+ case MLX4_OPCODE_RDMA_WRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case MLX4_OPCODE_SEND_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ fallthrough;
+ case MLX4_OPCODE_SEND:
+ case MLX4_OPCODE_SEND_INVAL:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case MLX4_OPCODE_RDMA_READ:
+ wc->opcode = IB_WC_RDMA_READ;
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+ break;
+ case MLX4_OPCODE_ATOMIC_CS:
+ wc->opcode = IB_WC_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_ATOMIC_FA:
+ wc->opcode = IB_WC_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_MASKED_ATOMIC_CS:
+ wc->opcode = IB_WC_MASKED_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_MASKED_ATOMIC_FA:
+ wc->opcode = IB_WC_MASKED_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_LSO:
+ wc->opcode = IB_WC_LSO;
+ break;
+ case MLX4_OPCODE_FMR:
+ wc->opcode = IB_WC_REG_MR;
+ break;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ wc->opcode = IB_WC_LOCAL_INV;
+ break;
+ }
+ } else {
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
+ break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_INVALIDATE;
+ wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid);
+ break;
+ case MLX4_RECV_OPCODE_SEND:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = 0;
+ break;
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
+ break;
+ }
+
+ is_eth = (rdma_port_get_link_layer(wc->qp->device,
+ (*cur_qp)->port) ==
+ IB_LINK_LAYER_ETHERNET);
+ if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
+ if ((*cur_qp)->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ use_tunnel_data(*cur_qp, cq, wc, tail, cqe,
+ is_eth);
+ return 0;
+ }
+ }
+
+ g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+ wc->src_qp = g_mlpath_rqpn & 0xffffff;
+ wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+ wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
+ wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
+ wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status,
+ cqe->badfcs_enc,
+ cqe->checksum) ? IB_WC_IP_CSUM_OK : 0;
+ if (is_eth) {
+ wc->slid = 0;
+ wc->sl = be16_to_cpu(cqe->sl_vid) >> 13;
+ if (be32_to_cpu(cqe->vlan_my_qpn) &
+ MLX4_CQE_CVLAN_PRESENT_MASK) {
+ wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
+ MLX4_CQE_VID_MASK;
+ } else {
+ wc->vlan_id = 0xffff;
+ }
+ memcpy(wc->smac, cqe->smac, ETH_ALEN);
+ wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+ } else {
+ wc->slid = be16_to_cpu(cqe->rlid);
+ wc->sl = be16_to_cpu(cqe->sl_vid) >> 12;
+ wc->vlan_id = 0xffff;
+ }
+ }
+
+ return 0;
+}
+
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_ib_qp *cur_qp = NULL;
+ unsigned long flags;
+ int npolled;
+ struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device);
+
+ spin_lock_irqsave(&cq->lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+ goto out;
+ }
+
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+ if (mlx4_ib_poll_one(cq, &cur_qp, wc + npolled))
+ break;
+ }
+
+ mlx4_cq_set_ci(&cq->mcq);
+
+out:
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return npolled;
+}
+
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ mlx4_cq_arm(&to_mcq(ibcq)->mcq,
+ (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+ to_mdev(ibcq->device)->uar_map,
+ MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
+
+ return 0;
+}
+
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ u32 prod_index;
+ int nfreed = 0;
+ struct mlx4_cqe *cqe, *dest;
+ u8 owner_bit;
+ int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0;
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+ if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+ break;
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ cqe += cqe_inc;
+
+ if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
+ if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+ ++nfreed;
+ } else if (nfreed) {
+ dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+ dest += cqe_inc;
+
+ owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+ memcpy(dest, cqe, sizeof *cqe);
+ dest->owner_sr_opcode = owner_bit |
+ (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ }
+ }
+
+ if (nfreed) {
+ cq->mcq.cons_index += nfreed;
+ /*
+ * Make sure update of buffer contents is done before
+ * updating consumer index.
+ */
+ wmb();
+ mlx4_cq_set_ci(&cq->mcq);
+ }
+}
+
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ spin_lock_irq(&cq->lock);
+ __mlx4_ib_cq_clean(cq, qpn, srq);
+ spin_unlock_irq(&cq->lock);
+}
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644
index 000000000..9bbd695a9
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "mlx4_ib.h"
+
+struct mlx4_ib_user_db_page {
+ struct list_head list;
+ struct ib_umem *umem;
+ unsigned long user_virt;
+ int refcnt;
+};
+
+int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
+ struct mlx4_db *db)
+{
+ struct mlx4_ib_user_db_page *page;
+ int err = 0;
+ struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
+
+ mutex_lock(&context->db_page_mutex);
+
+ list_for_each_entry(page, &context->db_page_list, list)
+ if (page->user_virt == (virt & PAGE_MASK))
+ goto found;
+
+ page = kmalloc(sizeof *page, GFP_KERNEL);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ page->user_virt = (virt & PAGE_MASK);
+ page->refcnt = 0;
+ page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
+ PAGE_SIZE, 0);
+ if (IS_ERR(page->umem)) {
+ err = PTR_ERR(page->umem);
+ kfree(page);
+ goto out;
+ }
+
+ list_add(&page->list, &context->db_page_list);
+
+found:
+ db->dma = sg_dma_address(page->umem->sgt_append.sgt.sgl) +
+ (virt & ~PAGE_MASK);
+ db->u.user_page = page;
+ ++page->refcnt;
+
+out:
+ mutex_unlock(&context->db_page_mutex);
+
+ return err;
+}
+
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db)
+{
+ mutex_lock(&context->db_page_mutex);
+
+ if (!--db->u.user_page->refcnt) {
+ list_del(&db->u.user_page->list);
+ ib_umem_release(db->u.user_page->umem);
+ kfree(db->u.user_page);
+ }
+
+ mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644
index 000000000..a37cfac5e
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -0,0 +1,2399 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cache.h>
+
+#include <linux/random.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/gfp.h>
+#include <rdma/ib_pma.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/mlx4/driver.h>
+#include "mlx4_ib.h"
+
+enum {
+ MLX4_IB_VENDOR_CLASS1 = 0x9,
+ MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+#define MLX4_TUN_SEND_WRID_SHIFT 34
+#define MLX4_TUN_QPN_SHIFT 32
+#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT)
+#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT)
+
+#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
+#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
+
+ /* Port mgmt change event handling */
+
+#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr)
+#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask)
+#define NUM_IDX_IN_PKEY_TBL_BLK 32
+#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */
+#define GUID_TBL_BLK_NUM_ENTRIES 8
+#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
+
+struct mlx4_mad_rcv_buf {
+ struct ib_grh grh;
+ u8 payload[256];
+} __packed;
+
+struct mlx4_mad_snd_buf {
+ u8 payload[256];
+} __packed;
+
+struct mlx4_tunnel_mad {
+ struct ib_grh grh;
+ struct mlx4_ib_tunnel_header hdr;
+ struct ib_mad mad;
+} __packed;
+
+struct mlx4_rcv_tunnel_mad {
+ struct mlx4_rcv_tunnel_hdr hdr;
+ struct ib_grh grh;
+ struct ib_mad mad;
+} __packed;
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u32 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u32 port_num);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ int block, u32 change_bitmap);
+
+__be64 mlx4_ib_gen_node_guid(void)
+{
+#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40))
+ return cpu_to_be64(NODE_GUID_HI | get_random_u32());
+}
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+ return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+ cpu_to_be64(0xff00000000000000LL);
+}
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
+ int port, const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const void *in_mad, void *response_mad)
+{
+ struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+ void *inbox;
+ int err;
+ u32 in_modifier = port;
+ u8 op_modifier = 0;
+
+ inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(inmailbox))
+ return PTR_ERR(inmailbox);
+ inbox = inmailbox->buf;
+
+ outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(outmailbox)) {
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ return PTR_ERR(outmailbox);
+ }
+
+ memcpy(inbox, in_mad, 256);
+
+ /*
+ * Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc)
+ op_modifier |= 0x1;
+ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc)
+ op_modifier |= 0x2;
+ if (mlx4_is_mfunc(dev->dev) &&
+ (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc))
+ op_modifier |= 0x8;
+
+ if (in_wc) {
+ struct {
+ __be32 my_qpn;
+ u32 reserved1;
+ __be32 rqpn;
+ u8 sl;
+ u8 g_path;
+ u16 reserved2[2];
+ __be16 pkey;
+ u32 reserved3[11];
+ u8 grh[40];
+ } *ext_info;
+
+ memset(inbox + 256, 0, 256);
+ ext_info = inbox + 256;
+
+ ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+ ext_info->rqpn = cpu_to_be32(in_wc->src_qp);
+ ext_info->sl = in_wc->sl << 4;
+ ext_info->g_path = in_wc->dlid_path_bits |
+ (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+ ext_info->pkey = cpu_to_be16(in_wc->pkey_index);
+
+ if (in_grh)
+ memcpy(ext_info->grh, in_grh, 40);
+
+ op_modifier |= 0x4;
+
+ in_modifier |= ib_lid_cpu16(in_wc->slid) << 16;
+ }
+
+ err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier,
+ mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier,
+ MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+ (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED);
+
+ if (!err)
+ memcpy(response_mad, outmailbox->buf, 256);
+
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+ return err;
+}
+
+static void update_sm_ah(struct mlx4_ib_dev *dev, u32 port_num, u16 lid, u8 sl)
+{
+ struct ib_ah *new_ah;
+ struct rdma_ah_attr ah_attr;
+ unsigned long flags;
+
+ if (!dev->send_agent[port_num - 1][0])
+ return;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.type = rdma_ah_find_type(&dev->ib_dev, port_num);
+ rdma_ah_set_dlid(&ah_attr, lid);
+ rdma_ah_set_sl(&ah_attr, sl);
+ rdma_ah_set_port_num(&ah_attr, port_num);
+
+ new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+ &ah_attr, 0);
+ if (IS_ERR(new_ah))
+ return;
+
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ if (dev->sm_ah[port_num - 1])
+ rdma_destroy_ah(dev->sm_ah[port_num - 1], 0);
+ dev->sm_ah[port_num - 1] = new_ah;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can
+ * synthesize LID change, Client-Rereg, GID change, and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev, u32 port_num,
+ const struct ib_mad *mad, u16 prev_lid)
+{
+ struct ib_port_info *pinfo;
+ u16 lid;
+ __be16 *base;
+ u32 bn, pkey_change_bitmap;
+ int i;
+
+
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_SET)
+ switch (mad->mad_hdr.attr_id) {
+ case IB_SMP_ATTR_PORT_INFO:
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+ return;
+ pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+ lid = be16_to_cpu(pinfo->lid);
+
+ update_sm_ah(dev, port_num,
+ be16_to_cpu(pinfo->sm_lid),
+ pinfo->neighbormtu_mastersmsl & 0xf);
+
+ if (pinfo->clientrereg_resv_subnetto & 0x80)
+ handle_client_rereg_event(dev, port_num);
+
+ if (prev_lid != lid)
+ handle_lid_change_event(dev, port_num);
+ break;
+
+ case IB_SMP_ATTR_PKEY_TABLE:
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+ return;
+ if (!mlx4_is_mfunc(dev->dev)) {
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_PKEY_CHANGE);
+ break;
+ }
+
+ /* at this point, we are running in the master.
+ * Slaves do not receive SMPs.
+ */
+ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF;
+ base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
+ pkey_change_bitmap = 0;
+ for (i = 0; i < 32; i++) {
+ pr_debug("PKEY[%d] = x%x\n",
+ i + bn*32, be16_to_cpu(base[i]));
+ if (be16_to_cpu(base[i]) !=
+ dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) {
+ pkey_change_bitmap |= (1 << i);
+ dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] =
+ be16_to_cpu(base[i]);
+ }
+ }
+ pr_debug("PKEY Change event: port=%u, "
+ "block=0x%x, change_bitmap=0x%x\n",
+ port_num, bn, pkey_change_bitmap);
+
+ if (pkey_change_bitmap) {
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_PKEY_CHANGE);
+ if (!dev->sriov.is_going_down)
+ __propagate_pkey_ev(dev, port_num, bn,
+ pkey_change_bitmap);
+ }
+ break;
+
+ case IB_SMP_ATTR_GUID_INFO:
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+ return;
+ /* paravirtualized master's guid is guid 0 -- does not change */
+ if (!mlx4_is_master(dev->dev))
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_GID_CHANGE);
+ /*if master, notify relevant slaves*/
+ if (mlx4_is_master(dev->dev) &&
+ !dev->sriov.is_going_down) {
+ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod);
+ mlx4_ib_update_cache_on_guid_change(dev, bn, port_num,
+ (u8 *)(&((struct ib_smp *)mad)->data));
+ mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num,
+ (u8 *)(&((struct ib_smp *)mad)->data));
+ }
+ break;
+
+ case IB_SMP_ATTR_SL_TO_VL_TABLE:
+ /* cache sl to vl mapping changes for use in
+ * filling QP1 LRH VL field when sending packets
+ */
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV &&
+ dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)
+ return;
+ if (!mlx4_is_slave(dev->dev)) {
+ union sl2vl_tbl_to_u64 sl2vl64;
+ int jj;
+
+ for (jj = 0; jj < 8; jj++) {
+ sl2vl64.sl8[jj] = ((struct ib_smp *)mad)->data[jj];
+ pr_debug("port %u, sl2vl[%d] = %02x\n",
+ port_num, jj, sl2vl64.sl8[jj]);
+ }
+ atomic64_set(&dev->sl2vl[port_num - 1], sl2vl64.sl64);
+ }
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ int block, u32 change_bitmap)
+{
+ int i, ix, slave, err;
+ int have_event = 0;
+
+ for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) {
+ if (slave == mlx4_master_func_num(dev->dev))
+ continue;
+ if (!mlx4_is_slave_active(dev->dev, slave))
+ continue;
+
+ have_event = 0;
+ for (i = 0; i < 32; i++) {
+ if (!(change_bitmap & (1 << i)))
+ continue;
+ for (ix = 0;
+ ix < dev->dev->caps.pkey_table_len[port_num]; ix++) {
+ if (dev->pkeys.virt2phys_pkey[slave][port_num - 1]
+ [ix] == i + 32 * block) {
+ err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num);
+ pr_debug("propagate_pkey_ev: slave %d,"
+ " port %d, ix %d (%d)\n",
+ slave, port_num, ix, err);
+ have_event = 1;
+ break;
+ }
+ }
+ if (have_event)
+ break;
+ }
+ }
+}
+
+static void node_desc_override(struct ib_device *dev,
+ struct ib_mad *mad)
+{
+ unsigned long flags;
+
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+ mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+ spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags);
+ memcpy(((struct ib_smp *) mad)->data, dev->node_desc,
+ IB_DEVICE_NODE_DESC_MAX);
+ spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags);
+ }
+}
+
+static void forward_trap(struct mlx4_ib_dev *dev, u32 port_num,
+ const struct ib_mad *mad)
+{
+ int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+ int ret;
+ unsigned long flags;
+
+ if (agent) {
+ send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
+ if (IS_ERR(send_buf))
+ return;
+ /*
+ * We rely here on the fact that MLX QPs don't use the
+ * address handle after the send is posted (this is
+ * wrong following the IB spec strictly, but we know
+ * it's OK for our devices).
+ */
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+ ret = ib_post_send_mad(send_buf, NULL);
+ else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+ if (ret)
+ ib_free_send_mad(send_buf);
+ }
+}
+
+static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *sa_mad)
+{
+ int ret = 0;
+
+ /* dispatch to different sa handlers */
+ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+ case IB_SA_ATTR_MC_MEMBER_REC:
+ ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u32 port, __be64 guid)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int i;
+
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (dev->sriov.demux[port - 1].guid_cache[i] == guid)
+ return i;
+ }
+ return -1;
+}
+
+
+static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave,
+ u32 port, u16 pkey, u16 *ix)
+{
+ int i, ret;
+ u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF;
+ u16 slot_pkey;
+
+ if (slave == mlx4_master_func_num(dev->dev))
+ return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix);
+
+ unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+
+ for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) {
+ if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix)
+ continue;
+
+ pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i];
+
+ ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey);
+ if (ret)
+ continue;
+ if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) {
+ if (slot_pkey & 0x8000) {
+ *ix = (u16) pkey_ix;
+ return 0;
+ } else {
+ /* take first partial pkey index found */
+ if (partial_ix == 0xFF)
+ partial_ix = pkey_ix;
+ }
+ }
+ }
+
+ if (partial_ix < 0xFF) {
+ *ix = (u16) partial_ix;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int get_gids_from_l3_hdr(struct ib_grh *grh, union ib_gid *sgid,
+ union ib_gid *dgid)
+{
+ int version = ib_get_rdma_header_version((const union rdma_network_hdr *)grh);
+ enum rdma_network_type net_type;
+
+ if (version == 4)
+ net_type = RDMA_NETWORK_IPV4;
+ else if (version == 6)
+ net_type = RDMA_NETWORK_IPV6;
+ else
+ return -EINVAL;
+
+ return ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+ sgid, dgid);
+}
+
+static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
+{
+ int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
+
+ return (qpn >= proxy_start && qpn <= proxy_start + 1);
+}
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u32 port,
+ enum ib_qp_type dest_qpt, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_mad *mad)
+{
+ struct ib_sge list;
+ struct ib_ud_wr wr;
+ const struct ib_send_wr *bad_wr;
+ struct mlx4_ib_demux_pv_ctx *tun_ctx;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct mlx4_rcv_tunnel_mad *tun_mad;
+ struct rdma_ah_attr attr;
+ struct ib_ah *ah;
+ struct ib_qp *src_qp = NULL;
+ unsigned tun_tx_ix = 0;
+ int dqpn;
+ int ret = 0;
+ u16 tun_pkey_ix;
+ u16 cached_pkey;
+ u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+
+ if (dest_qpt > IB_QPT_GSI) {
+ pr_debug("dest_qpt (%d) > IB_QPT_GSI\n", dest_qpt);
+ return -EINVAL;
+ }
+
+ tun_ctx = dev->sriov.demux[port-1].tun[slave];
+
+ /* check if proxy qp created */
+ if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)
+ return -EAGAIN;
+
+ if (!dest_qpt)
+ tun_qp = &tun_ctx->qp[0];
+ else
+ tun_qp = &tun_ctx->qp[1];
+
+ /* compute P_Key index to put in tunnel header for slave */
+ if (dest_qpt) {
+ u16 pkey_ix;
+ ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey);
+ if (ret) {
+ pr_debug("unable to get %s cached pkey for index %d, ret %d\n",
+ is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+ wc->pkey_index, ret);
+ return -EINVAL;
+ }
+
+ ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix);
+ if (ret) {
+ pr_debug("unable to get %s pkey ix for pkey 0x%x, ret %d\n",
+ is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+ cached_pkey, ret);
+ return -EINVAL;
+ }
+ tun_pkey_ix = pkey_ix;
+ } else
+ tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+
+ dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1;
+
+ /* get tunnel tx data buf for slave */
+ src_qp = tun_qp->qp;
+
+ /* create ah. Just need an empty one with the port num for the post send.
+ * The driver will set the force loopback bit in post_send */
+ memset(&attr, 0, sizeof attr);
+ attr.type = rdma_ah_find_type(&dev->ib_dev, port);
+
+ rdma_ah_set_port_num(&attr, port);
+ if (is_eth) {
+ union ib_gid sgid;
+ union ib_gid dgid;
+
+ if (get_gids_from_l3_hdr(grh, &sgid, &dgid))
+ return -EINVAL;
+ rdma_ah_set_grh(&attr, &dgid, 0, 0, 0, 0);
+ }
+ ah = rdma_create_ah(tun_ctx->pd, &attr, 0);
+ if (IS_ERR(ah))
+ return -ENOMEM;
+
+ /* allocate tunnel tx buf after pass failure returns */
+ spin_lock(&tun_qp->tx_lock);
+ if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >=
+ (MLX4_NUM_TUNNEL_BUFS - 1))
+ ret = -EAGAIN;
+ else
+ tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+ spin_unlock(&tun_qp->tx_lock);
+ if (ret)
+ goto end;
+
+ tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
+ if (tun_qp->tx_ring[tun_tx_ix].ah)
+ rdma_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah, 0);
+ tun_qp->tx_ring[tun_tx_ix].ah = ah;
+ ib_dma_sync_single_for_cpu(&dev->ib_dev,
+ tun_qp->tx_ring[tun_tx_ix].buf.map,
+ sizeof (struct mlx4_rcv_tunnel_mad),
+ DMA_TO_DEVICE);
+
+ /* copy over to tunnel buffer */
+ if (grh)
+ memcpy(&tun_mad->grh, grh, sizeof *grh);
+ memcpy(&tun_mad->mad, mad, sizeof *mad);
+
+ /* adjust tunnel data */
+ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix);
+ tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF);
+ tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0;
+
+ if (is_eth) {
+ u16 vlan = 0;
+ if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan,
+ NULL)) {
+ /* VST mode */
+ if (vlan != wc->vlan_id)
+ /* Packet vlan is not the VST-assigned vlan.
+ * Drop the packet.
+ */
+ goto out;
+ else
+ /* Remove the vlan tag before forwarding
+ * the packet to the VF.
+ */
+ vlan = 0xffff;
+ } else {
+ vlan = wc->vlan_id;
+ }
+
+ tun_mad->hdr.sl_vid = cpu_to_be16(vlan);
+ memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4);
+ memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2);
+ } else {
+ tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12);
+ tun_mad->hdr.slid_mac_47_32 = ib_lid_be16(wc->slid);
+ }
+
+ ib_dma_sync_single_for_device(&dev->ib_dev,
+ tun_qp->tx_ring[tun_tx_ix].buf.map,
+ sizeof (struct mlx4_rcv_tunnel_mad),
+ DMA_TO_DEVICE);
+
+ list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
+ list.length = sizeof (struct mlx4_rcv_tunnel_mad);
+ list.lkey = tun_ctx->pd->local_dma_lkey;
+
+ wr.ah = ah;
+ wr.port_num = port;
+ wr.remote_qkey = IB_QP_SET_QKEY;
+ wr.remote_qpn = dqpn;
+ wr.wr.next = NULL;
+ wr.wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt);
+ wr.wr.sg_list = &list;
+ wr.wr.num_sge = 1;
+ wr.wr.opcode = IB_WR_SEND;
+ wr.wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(src_qp, &wr.wr, &bad_wr);
+ if (!ret)
+ return 0;
+ out:
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+ tun_qp->tx_ring[tun_tx_ix].ah = NULL;
+end:
+ rdma_destroy_ah(ah, 0);
+ return ret;
+}
+
+static int mlx4_ib_demux_mad(struct ib_device *ibdev, u32 port,
+ struct ib_wc *wc, struct ib_grh *grh,
+ struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int err, other_port;
+ int slave = -1;
+ u8 *slave_id;
+ int is_eth = 0;
+
+ if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+ is_eth = 0;
+ else
+ is_eth = 1;
+
+ if (is_eth) {
+ union ib_gid dgid;
+ union ib_gid sgid;
+
+ if (get_gids_from_l3_hdr(grh, &sgid, &dgid))
+ return -EINVAL;
+ if (!(wc->wc_flags & IB_WC_GRH)) {
+ mlx4_ib_warn(ibdev, "RoCE grh not present.\n");
+ return -EINVAL;
+ }
+ if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) {
+ mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n");
+ return -EINVAL;
+ }
+ err = mlx4_get_slave_from_roce_gid(dev->dev, port, dgid.raw, &slave);
+ if (err && mlx4_is_mf_bonded(dev->dev)) {
+ other_port = (port == 1) ? 2 : 1;
+ err = mlx4_get_slave_from_roce_gid(dev->dev, other_port, dgid.raw, &slave);
+ if (!err) {
+ port = other_port;
+ pr_debug("resolved slave %d from gid %pI6 wire port %d other %d\n",
+ slave, grh->dgid.raw, port, other_port);
+ }
+ }
+ if (err) {
+ mlx4_ib_warn(ibdev, "failed matching grh\n");
+ return -ENOENT;
+ }
+ if (slave >= dev->dev->caps.sqp_demux) {
+ mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+ slave, dev->dev->caps.sqp_demux);
+ return -ENOENT;
+ }
+
+ if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad))
+ return 0;
+
+ err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+ if (err)
+ pr_debug("failed sending %s to slave %d via tunnel qp (%d)\n",
+ is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+ slave, err);
+ return 0;
+ }
+
+ /* Initially assume that this mad is for us */
+ slave = mlx4_master_func_num(dev->dev);
+
+ /* See if the slave id is encoded in a response mad */
+ if (mad->mad_hdr.method & 0x80) {
+ slave_id = (u8 *) &mad->mad_hdr.tid;
+ slave = *slave_id;
+ if (slave != 255) /*255 indicates the dom0*/
+ *slave_id = 0; /* remap tid */
+ }
+
+ /* If a grh is present, we demux according to it */
+ if (wc->wc_flags & IB_WC_GRH) {
+ if (grh->dgid.global.interface_id ==
+ cpu_to_be64(IB_SA_WELL_KNOWN_GUID) &&
+ grh->dgid.global.subnet_prefix == cpu_to_be64(
+ atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix))) {
+ slave = 0;
+ } else {
+ slave = mlx4_ib_find_real_gid(ibdev, port,
+ grh->dgid.global.interface_id);
+ if (slave < 0) {
+ mlx4_ib_warn(ibdev, "failed matching grh\n");
+ return -ENOENT;
+ }
+ }
+ }
+ /* Class-specific handling */
+ switch (mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ /* 255 indicates the dom0 */
+ if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) {
+ if (!mlx4_vf_smi_enabled(dev->dev, slave, port))
+ return -EPERM;
+ /* for a VF. drop unsolicited MADs */
+ if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) {
+ mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n",
+ slave, mad->mad_hdr.mgmt_class,
+ mad->mad_hdr.method);
+ return -EINVAL;
+ }
+ }
+ break;
+ case IB_MGMT_CLASS_SUBN_ADM:
+ if (mlx4_ib_demux_sa_handler(ibdev, port, slave,
+ (struct ib_sa_mad *) mad))
+ return 0;
+ break;
+ case IB_MGMT_CLASS_CM:
+ if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad))
+ return 0;
+ break;
+ case IB_MGMT_CLASS_DEVICE_MGMT:
+ if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP)
+ return 0;
+ break;
+ default:
+ /* Drop unsupported classes for slaves in tunnel mode */
+ if (slave != mlx4_master_func_num(dev->dev)) {
+ pr_debug("dropping unsupported ingress mad from class:%d "
+ "for slave:%d\n", mad->mad_hdr.mgmt_class, slave);
+ return 0;
+ }
+ }
+ /*make sure that no slave==255 was not handled yet.*/
+ if (slave >= dev->dev->caps.sqp_demux) {
+ mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+ slave, dev->dev->caps.sqp_demux);
+ return -ENOENT;
+ }
+
+ err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+ if (err)
+ pr_debug("failed sending %s to slave %d via tunnel qp (%d)\n",
+ is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+ slave, err);
+ return 0;
+}
+
+static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ u16 slid, prev_lid = 0;
+ int err;
+ struct ib_port_attr pattr;
+
+ slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+ forward_trap(to_mdev(ibdev), port_num, in_mad);
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ }
+
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /*
+ * Don't process SMInfo queries -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else
+ return IB_MAD_RESULT_SUCCESS;
+
+ if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ !ib_query_port(ibdev, port_num, &pattr))
+ prev_lid = ib_lid_cpu16(pattr.lid);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev),
+ (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) |
+ (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) |
+ MLX4_MAD_IFC_NET_VIEW,
+ port_num, in_wc, in_grh, in_mad, out_mad);
+ if (err)
+ return IB_MAD_RESULT_FAILURE;
+
+ if (!out_mad->mad_hdr.status) {
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
+ /* slaves get node desc from FW */
+ if (!mlx4_is_slave(to_mdev(ibdev)->dev))
+ node_desc_override(ibdev, out_mad);
+ }
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void edit_counter(struct mlx4_counter *cnt, void *counters,
+ __be16 attr_id)
+{
+ switch (attr_id) {
+ case IB_PMA_PORT_COUNTERS:
+ {
+ struct ib_pma_portcounters *pma_cnt =
+ (struct ib_pma_portcounters *)counters;
+
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data,
+ (be64_to_cpu(cnt->tx_bytes) >> 2));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data,
+ (be64_to_cpu(cnt->rx_bytes) >> 2));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets,
+ be64_to_cpu(cnt->tx_frames));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets,
+ be64_to_cpu(cnt->rx_frames));
+ break;
+ }
+ case IB_PMA_PORT_COUNTERS_EXT:
+ {
+ struct ib_pma_portcounters_ext *pma_cnt_ext =
+ (struct ib_pma_portcounters_ext *)counters;
+
+ pma_cnt_ext->port_xmit_data =
+ cpu_to_be64(be64_to_cpu(cnt->tx_bytes) >> 2);
+ pma_cnt_ext->port_rcv_data =
+ cpu_to_be64(be64_to_cpu(cnt->rx_bytes) >> 2);
+ pma_cnt_ext->port_xmit_packets = cnt->tx_frames;
+ pma_cnt_ext->port_rcv_packets = cnt->rx_frames;
+ break;
+ }
+ }
+}
+
+static int iboe_process_mad_port_info(void *out_mad)
+{
+ struct ib_class_port_info cpi = {};
+
+ cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+ memcpy(out_mad, &cpi, sizeof(cpi));
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int iboe_process_mad(struct ib_device *ibdev, int mad_flags,
+ u32 port_num, const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ struct mlx4_counter counter_stats;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct counter_index *tmp_counter;
+ int err = IB_MAD_RESULT_FAILURE, stats_avail = 0;
+
+ if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+ return -EINVAL;
+
+ if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO)
+ return iboe_process_mad_port_info((void *)(out_mad->data + 40));
+
+ memset(&counter_stats, 0, sizeof(counter_stats));
+ mutex_lock(&dev->counters_table[port_num - 1].mutex);
+ list_for_each_entry(tmp_counter,
+ &dev->counters_table[port_num - 1].counters_list,
+ list) {
+ err = mlx4_get_counter_stats(dev->dev,
+ tmp_counter->index,
+ &counter_stats, 0);
+ if (err) {
+ err = IB_MAD_RESULT_FAILURE;
+ stats_avail = 0;
+ break;
+ }
+ stats_avail = 1;
+ }
+ mutex_unlock(&dev->counters_table[port_num - 1].mutex);
+ if (stats_avail) {
+ switch (counter_stats.counter_mode & 0xf) {
+ case 0:
+ edit_counter(&counter_stats,
+ (void *)(out_mad->data + 40),
+ in_mad->mad_hdr.attr_id);
+ err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+ break;
+ default:
+ err = IB_MAD_RESULT_FAILURE;
+ }
+ }
+
+ return err;
+}
+
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad *in, struct ib_mad *out,
+ size_t *out_mad_size, u16 *out_mad_pkey_index)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num);
+
+ /* iboe_process_mad() which uses the HCA flow-counters to implement IB PMA
+ * queries, should be called only by VFs and for that specific purpose
+ */
+ if (link == IB_LINK_LAYER_INFINIBAND) {
+ if (mlx4_is_slave(dev->dev) &&
+ (in->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
+ (in->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS ||
+ in->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT ||
+ in->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO)))
+ return iboe_process_mad(ibdev, mad_flags, port_num,
+ in_wc, in_grh, in, out);
+
+ return ib_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
+ in, out);
+ }
+
+ if (link == IB_LINK_LAYER_ETHERNET)
+ return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
+ in_grh, in, out);
+
+ return -EINVAL;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ if (mad_send_wc->send_buf->context[0])
+ rdma_destroy_ah(mad_send_wc->send_buf->context[0], 0);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+ int ret;
+ enum rdma_link_layer ll;
+
+ for (p = 0; p < dev->num_ports; ++p) {
+ ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1);
+ for (q = 0; q <= 1; ++q) {
+ if (ll == IB_LINK_LAYER_INFINIBAND) {
+ agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+ q ? IB_QPT_GSI : IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+ dev->send_agent[p][q] = agent;
+ } else
+ dev->send_agent[p][q] = NULL;
+ }
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dev->num_ports; ++p)
+ for (q = 0; q <= 1; ++q)
+ if (dev->send_agent[p][q])
+ ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+ return ret;
+}
+
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ for (p = 0; p < dev->num_ports; ++p) {
+ for (q = 0; q <= 1; ++q) {
+ agent = dev->send_agent[p][q];
+ if (agent) {
+ dev->send_agent[p][q] = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ }
+
+ if (dev->sm_ah[p])
+ rdma_destroy_ah(dev->sm_ah[p], 0);
+ }
+}
+
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u32 port_num)
+{
+ mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE);
+
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+ MLX4_EQ_PORT_INFO_LID_CHANGE_MASK);
+}
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u32 port_num)
+{
+ /* re-configure the alias-guid and mcg's */
+ if (mlx4_is_master(dev->dev)) {
+ mlx4_ib_invalidate_all_guid_record(dev, port_num);
+
+ if (!dev->sriov.is_going_down) {
+ mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0);
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+ MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK);
+ }
+ }
+
+ /* Update the sl to vl table from inside client rereg
+ * only if in secure-host mode (snooping is not possible)
+ * and the sl-to-vl change event is not generated by FW.
+ */
+ if (!mlx4_is_slave(dev->dev) &&
+ dev->dev->flags & MLX4_FLAG_SECURE_HOST &&
+ !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) {
+ if (mlx4_is_master(dev->dev))
+ /* already in work queue from mlx4_ib_event queueing
+ * mlx4_handle_port_mgmt_change_event, which calls
+ * this procedure. Therefore, call sl2vl_update directly.
+ */
+ mlx4_ib_sl2vl_update(dev, port_num);
+ else
+ mlx4_sched_ib_sl2vl_update_work(dev, port_num);
+ }
+ mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
+static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ struct mlx4_eqe *eqe)
+{
+ __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe),
+ GET_MASK_FROM_EQE(eqe));
+}
+
+static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u32 port_num,
+ u32 guid_tbl_blk_num, u32 change_bitmap)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ u16 i;
+
+ if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev))
+ return;
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ guid_tbl_blk_num *= 4;
+
+ for (i = 0; i < 4; i++) {
+ if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff)))
+ continue;
+ memset(in_mad, 0, sizeof *in_mad);
+ memset(out_mad, 0, sizeof *out_mad);
+
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i);
+
+ if (mlx4_MAD_IFC(dev,
+ MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW,
+ port_num, NULL, NULL, in_mad, out_mad)) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n");
+ goto out;
+ }
+
+ mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i,
+ port_num,
+ (u8 *)(&((struct ib_smp *)out_mad)->data));
+ mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i,
+ port_num,
+ (u8 *)(&((struct ib_smp *)out_mad)->data));
+ }
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return;
+}
+
+void handle_port_mgmt_change_event(struct work_struct *work)
+{
+ struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
+ struct mlx4_ib_dev *dev = ew->ib_dev;
+ struct mlx4_eqe *eqe = &(ew->ib_eqe);
+ u32 port = eqe->event.port_mgmt_change.port;
+ u32 changed_attr;
+ u32 tbl_block;
+ u32 change_bitmap;
+
+ switch (eqe->subtype) {
+ case MLX4_DEV_PMC_SUBTYPE_PORT_INFO:
+ changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr);
+
+ /* Update the SM ah - This should be done before handling
+ the other changed attributes so that MADs can be sent to the SM */
+ if (changed_attr & MSTR_SM_CHANGE_MASK) {
+ u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid);
+ u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf;
+ update_sm_ah(dev, port, lid, sl);
+ }
+
+ /* Check if it is a lid change event */
+ if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK)
+ handle_lid_change_event(dev, port);
+
+ /* Generate GUID changed event */
+ if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) {
+ if (mlx4_is_master(dev->dev)) {
+ union ib_gid gid;
+ int err = 0;
+
+ if (!eqe->event.port_mgmt_change.params.port_info.gid_prefix)
+ err = __mlx4_ib_query_gid(&dev->ib_dev, port, 0, &gid, 1);
+ else
+ gid.global.subnet_prefix =
+ eqe->event.port_mgmt_change.params.port_info.gid_prefix;
+ if (err) {
+ pr_warn("Could not change QP1 subnet prefix for port %d: query_gid error (%d)\n",
+ port, err);
+ } else {
+ pr_debug("Changing QP1 subnet prefix for port %d. old=0x%llx. new=0x%llx\n",
+ port,
+ (u64)atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix),
+ be64_to_cpu(gid.global.subnet_prefix));
+ atomic64_set(&dev->sriov.demux[port - 1].subnet_prefix,
+ be64_to_cpu(gid.global.subnet_prefix));
+ }
+ }
+ mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+ /*if master, notify all slaves*/
+ if (mlx4_is_master(dev->dev))
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port,
+ MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK);
+ }
+
+ if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
+ handle_client_rereg_event(dev, port);
+ break;
+
+ case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
+ mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE);
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+ propagate_pkey_ev(dev, port, eqe);
+ break;
+ case MLX4_DEV_PMC_SUBTYPE_GUID_INFO:
+ /* paravirtualized master's guid is guid 0 -- does not change */
+ if (!mlx4_is_master(dev->dev))
+ mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+ /*if master, notify relevant slaves*/
+ else if (!dev->sriov.is_going_down) {
+ tbl_block = GET_BLK_PTR_FROM_EQE(eqe);
+ change_bitmap = GET_MASK_FROM_EQE(eqe);
+ handle_slaves_guid_change(dev, port, tbl_block, change_bitmap);
+ }
+ break;
+
+ case MLX4_DEV_PMC_SUBTYPE_SL_TO_VL_MAP:
+ /* cache sl to vl mapping changes for use in
+ * filling QP1 LRH VL field when sending packets
+ */
+ if (!mlx4_is_slave(dev->dev)) {
+ union sl2vl_tbl_to_u64 sl2vl64;
+ int jj;
+
+ for (jj = 0; jj < 8; jj++) {
+ sl2vl64.sl8[jj] =
+ eqe->event.port_mgmt_change.params.sl2vl_tbl_change_info.sl2vl_table[jj];
+ pr_debug("port %u, sl2vl[%d] = %02x\n",
+ port, jj, sl2vl64.sl8[jj]);
+ }
+ atomic64_set(&dev->sl2vl[port - 1], sl2vl64.sl64);
+ }
+ break;
+ default:
+ pr_warn("Unsupported subtype 0x%x for "
+ "Port Management Change event\n", eqe->subtype);
+ }
+
+ kfree(ew);
+}
+
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u32 port_num,
+ enum ib_event_type type)
+{
+ struct ib_event event;
+
+ event.device = &dev->ib_dev;
+ event.element.port_num = port_num;
+ event.event = type;
+
+ ib_dispatch_event(&event);
+}
+
+static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
+{
+ unsigned long flags;
+ struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+ queue_work(ctx->wq, &ctx->work);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void mlx4_ib_wire_comp_handler(struct ib_cq *cq, void *arg)
+{
+ unsigned long flags;
+ struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+ queue_work(ctx->wi_wq, &ctx->work);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
+ struct mlx4_ib_demux_pv_qp *tun_qp,
+ int index)
+{
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr;
+ const struct ib_recv_wr *bad_recv_wr;
+ int size;
+
+ size = (tun_qp->qp->qp_type == IB_QPT_UD) ?
+ sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf);
+
+ sg_list.addr = tun_qp->ring[index].map;
+ sg_list.length = size;
+ sg_list.lkey = ctx->pd->local_dma_lkey;
+
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+ recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV |
+ MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt);
+ ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map,
+ size, DMA_FROM_DEVICE);
+ return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr);
+}
+
+static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
+ int slave, struct ib_sa_mad *sa_mad)
+{
+ int ret = 0;
+
+ /* dispatch to different sa handlers */
+ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+ case IB_SA_ATTR_MC_MEMBER_REC:
+ ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u32 port,
+ enum ib_qp_type dest_qpt, u16 pkey_index,
+ u32 remote_qpn, u32 qkey, struct rdma_ah_attr *attr,
+ u8 *s_mac, u16 vlan_id, struct ib_mad *mad)
+{
+ struct ib_sge list;
+ struct ib_ud_wr wr;
+ const struct ib_send_wr *bad_wr;
+ struct mlx4_ib_demux_pv_ctx *sqp_ctx;
+ struct mlx4_ib_demux_pv_qp *sqp;
+ struct mlx4_mad_snd_buf *sqp_mad;
+ struct ib_ah *ah;
+ struct ib_qp *send_qp = NULL;
+ unsigned wire_tx_ix = 0;
+ u16 wire_pkey_ix;
+ int src_qpnum;
+ int ret;
+
+ sqp_ctx = dev->sriov.sqps[port-1];
+
+ /* check if proxy qp created */
+ if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE)
+ return -EAGAIN;
+
+ if (dest_qpt == IB_QPT_SMI) {
+ src_qpnum = 0;
+ sqp = &sqp_ctx->qp[0];
+ wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+ } else {
+ src_qpnum = 1;
+ sqp = &sqp_ctx->qp[1];
+ wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index];
+ }
+
+ send_qp = sqp->qp;
+
+ ah = rdma_zalloc_drv_obj(sqp_ctx->pd->device, ib_ah);
+ if (!ah)
+ return -ENOMEM;
+
+ ah->device = sqp_ctx->pd->device;
+ ah->pd = sqp_ctx->pd;
+
+ /* create ah */
+ ret = mlx4_ib_create_ah_slave(ah, attr,
+ rdma_ah_retrieve_grh(attr)->sgid_index,
+ s_mac, vlan_id);
+ if (ret)
+ goto out;
+
+ spin_lock(&sqp->tx_lock);
+ if (sqp->tx_ix_head - sqp->tx_ix_tail >=
+ (MLX4_NUM_WIRE_BUFS - 1))
+ ret = -EAGAIN;
+ else
+ wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_WIRE_BUFS - 1);
+ spin_unlock(&sqp->tx_lock);
+ if (ret)
+ goto out;
+
+ sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
+ kfree(sqp->tx_ring[wire_tx_ix].ah);
+ sqp->tx_ring[wire_tx_ix].ah = ah;
+ ib_dma_sync_single_for_cpu(&dev->ib_dev,
+ sqp->tx_ring[wire_tx_ix].buf.map,
+ sizeof (struct mlx4_mad_snd_buf),
+ DMA_TO_DEVICE);
+
+ memcpy(&sqp_mad->payload, mad, sizeof *mad);
+
+ ib_dma_sync_single_for_device(&dev->ib_dev,
+ sqp->tx_ring[wire_tx_ix].buf.map,
+ sizeof (struct mlx4_mad_snd_buf),
+ DMA_TO_DEVICE);
+
+ list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
+ list.length = sizeof (struct mlx4_mad_snd_buf);
+ list.lkey = sqp_ctx->pd->local_dma_lkey;
+
+ wr.ah = ah;
+ wr.port_num = port;
+ wr.pkey_index = wire_pkey_ix;
+ wr.remote_qkey = qkey;
+ wr.remote_qpn = remote_qpn;
+ wr.wr.next = NULL;
+ wr.wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum);
+ wr.wr.sg_list = &list;
+ wr.wr.num_sge = 1;
+ wr.wr.opcode = IB_WR_SEND;
+ wr.wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(send_qp, &wr.wr, &bad_wr);
+ if (!ret)
+ return 0;
+
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ sqp->tx_ring[wire_tx_ix].ah = NULL;
+out:
+ kfree(ah);
+ return ret;
+}
+
+static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port)
+{
+ if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
+ return slave;
+ return mlx4_get_base_gid_ix(dev->dev, slave, port);
+}
+
+static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port,
+ struct rdma_ah_attr *ah_attr)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr);
+ if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
+ grh->sgid_index = slave;
+ else
+ grh->sgid_index += get_slave_base_gid_ix(dev, slave, port);
+}
+
+static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)];
+ int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1);
+ struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr;
+ struct mlx4_ib_ah ah;
+ struct rdma_ah_attr ah_attr;
+ u8 *slave_id;
+ int slave;
+ int port;
+ u16 vlan_id;
+ u8 qos;
+ u8 *dmac;
+ int sts;
+
+ /* Get slave that sent this packet */
+ if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
+ wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX ||
+ (wc->src_qp & 0x1) != ctx->port - 1 ||
+ wc->src_qp & 0x4) {
+ mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp);
+ return;
+ }
+ slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8;
+ if (slave != ctx->slave) {
+ mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
+ "belongs to another slave\n", wc->src_qp);
+ return;
+ }
+
+ /* Map transaction ID */
+ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map,
+ sizeof (struct mlx4_tunnel_mad),
+ DMA_FROM_DEVICE);
+ switch (tunnel->mad.mad_hdr.method) {
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_GET:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_SA_METHOD_DELETE:
+ case IB_SA_METHOD_GET_MULTI:
+ case IB_SA_METHOD_GET_TRACE_TBL:
+ slave_id = (u8 *) &tunnel->mad.mad_hdr.tid;
+ if (*slave_id) {
+ mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d "
+ "class:%d slave:%d\n", *slave_id,
+ tunnel->mad.mad_hdr.mgmt_class, slave);
+ return;
+ } else
+ *slave_id = slave;
+ break;
+ default:
+ /* nothing */;
+ }
+
+ /* Class-specific handling */
+ switch (tunnel->mad.mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ if (slave != mlx4_master_func_num(dev->dev) &&
+ !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port))
+ return;
+ break;
+ case IB_MGMT_CLASS_SUBN_ADM:
+ if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,
+ (struct ib_sa_mad *) &tunnel->mad))
+ return;
+ break;
+ case IB_MGMT_CLASS_CM:
+ if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave,
+ (struct ib_mad *) &tunnel->mad))
+ return;
+ break;
+ case IB_MGMT_CLASS_DEVICE_MGMT:
+ if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET &&
+ tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET)
+ return;
+ break;
+ default:
+ /* Drop unsupported classes for slaves in tunnel mode */
+ if (slave != mlx4_master_func_num(dev->dev)) {
+ mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d "
+ "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave);
+ return;
+ }
+ }
+
+ /* We are using standard ib_core services to send the mad, so generate a
+ * stadard address handle by decoding the tunnelled mlx4_ah fields */
+ memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av));
+ ah.ibah.device = ctx->ib_dev;
+
+ port = be32_to_cpu(ah.av.ib.port_pd) >> 24;
+ port = mlx4_slave_convert_port(dev->dev, slave, port);
+ if (port < 0)
+ return;
+ ah.av.ib.port_pd = cpu_to_be32(port << 24 | (be32_to_cpu(ah.av.ib.port_pd) & 0xffffff));
+ ah.ibah.type = rdma_ah_find_type(&dev->ib_dev, port);
+
+ mlx4_ib_query_ah(&ah.ibah, &ah_attr);
+ if (rdma_ah_get_ah_flags(&ah_attr) & IB_AH_GRH)
+ fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr);
+ dmac = rdma_ah_retrieve_dmac(&ah_attr);
+ if (dmac)
+ memcpy(dmac, tunnel->hdr.mac, ETH_ALEN);
+ vlan_id = be16_to_cpu(tunnel->hdr.vlan);
+ /* if slave have default vlan use it */
+ if (mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave,
+ &vlan_id, &qos))
+ rdma_ah_set_sl(&ah_attr, qos);
+
+ sts = mlx4_ib_send_to_wire(dev, slave, ctx->port,
+ is_proxy_qp0(dev, wc->src_qp, slave) ?
+ IB_QPT_SMI : IB_QPT_GSI,
+ be16_to_cpu(tunnel->hdr.pkey_index),
+ be32_to_cpu(tunnel->hdr.remote_qpn),
+ be32_to_cpu(tunnel->hdr.qkey),
+ &ah_attr, wc->smac, vlan_id, &tunnel->mad);
+ if (sts)
+ pr_debug("failed sending %s to wire on behalf of slave %d (%d)\n",
+ is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+ slave, sts);
+}
+
+static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int is_tun)
+{
+ int i;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ int rx_buf_size, tx_buf_size;
+ const int nmbr_bufs = is_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
+
+ if (qp_type > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_qp = &ctx->qp[qp_type];
+
+ tun_qp->ring = kcalloc(nmbr_bufs,
+ sizeof(struct mlx4_ib_buf),
+ GFP_KERNEL);
+ if (!tun_qp->ring)
+ return -ENOMEM;
+
+ tun_qp->tx_ring = kcalloc(nmbr_bufs,
+ sizeof (struct mlx4_ib_tun_tx_buf),
+ GFP_KERNEL);
+ if (!tun_qp->tx_ring) {
+ kfree(tun_qp->ring);
+ tun_qp->ring = NULL;
+ return -ENOMEM;
+ }
+
+ if (is_tun) {
+ rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+ tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+ } else {
+ rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+ tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+ }
+
+ for (i = 0; i < nmbr_bufs; i++) {
+ tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL);
+ if (!tun_qp->ring[i].addr)
+ goto err;
+ tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev,
+ tun_qp->ring[i].addr,
+ rx_buf_size,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) {
+ kfree(tun_qp->ring[i].addr);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < nmbr_bufs; i++) {
+ tun_qp->tx_ring[i].buf.addr =
+ kmalloc(tx_buf_size, GFP_KERNEL);
+ if (!tun_qp->tx_ring[i].buf.addr)
+ goto tx_err;
+ tun_qp->tx_ring[i].buf.map =
+ ib_dma_map_single(ctx->ib_dev,
+ tun_qp->tx_ring[i].buf.addr,
+ tx_buf_size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ctx->ib_dev,
+ tun_qp->tx_ring[i].buf.map)) {
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ goto tx_err;
+ }
+ tun_qp->tx_ring[i].ah = NULL;
+ }
+ spin_lock_init(&tun_qp->tx_lock);
+ tun_qp->tx_ix_head = 0;
+ tun_qp->tx_ix_tail = 0;
+ tun_qp->proxy_qpt = qp_type;
+
+ return 0;
+
+tx_err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+ tx_buf_size, DMA_TO_DEVICE);
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ }
+ i = nmbr_bufs;
+err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+ rx_buf_size, DMA_FROM_DEVICE);
+ kfree(tun_qp->ring[i].addr);
+ }
+ kfree(tun_qp->tx_ring);
+ tun_qp->tx_ring = NULL;
+ kfree(tun_qp->ring);
+ tun_qp->ring = NULL;
+ return -ENOMEM;
+}
+
+static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int is_tun)
+{
+ int i;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ int rx_buf_size, tx_buf_size;
+ const int nmbr_bufs = is_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
+
+ if (qp_type > IB_QPT_GSI)
+ return;
+
+ tun_qp = &ctx->qp[qp_type];
+ if (is_tun) {
+ rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+ tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+ } else {
+ rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+ tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+ }
+
+
+ for (i = 0; i < nmbr_bufs; i++) {
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+ rx_buf_size, DMA_FROM_DEVICE);
+ kfree(tun_qp->ring[i].addr);
+ }
+
+ for (i = 0; i < nmbr_bufs; i++) {
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+ tx_buf_size, DMA_TO_DEVICE);
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ if (tun_qp->tx_ring[i].ah)
+ rdma_destroy_ah(tun_qp->tx_ring[i].ah, 0);
+ }
+ kfree(tun_qp->tx_ring);
+ kfree(tun_qp->ring);
+}
+
+static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct ib_wc wc;
+ int ret;
+ ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+ ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+ while (ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+ tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_RECV:
+ mlx4_ib_multiplex_mad(ctx, &wc);
+ ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp,
+ wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1));
+ if (ret)
+ pr_err("Failed reposting tunnel "
+ "buf:%lld\n", wc.wr_id);
+ break;
+ case IB_WC_SEND:
+ rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
+ tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+
+ break;
+ default:
+ break;
+ }
+ } else {
+ pr_debug("mlx4_ib: completion error in tunnel: %d."
+ " status = %d, wrid = 0x%llx\n",
+ ctx->slave, wc.status, wc.wr_id);
+ if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+ rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
+ tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+ }
+ }
+ }
+}
+
+static void pv_qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct mlx4_ib_demux_pv_ctx *sqp = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ pr_err("Fatal error (%d) on a MAD QP on port %d\n",
+ event->event, sqp->port);
+}
+
+static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int create_tun)
+{
+ int i, ret;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct mlx4_ib_qp_tunnel_init_attr qp_init_attr;
+ struct ib_qp_attr attr;
+ int qp_attr_mask_INIT;
+ const int nmbr_bufs = create_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
+
+ if (qp_type > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_qp = &ctx->qp[qp_type];
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.init_attr.send_cq = ctx->cq;
+ qp_init_attr.init_attr.recv_cq = ctx->cq;
+ qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.init_attr.cap.max_send_wr = nmbr_bufs;
+ qp_init_attr.init_attr.cap.max_recv_wr = nmbr_bufs;
+ qp_init_attr.init_attr.cap.max_send_sge = 1;
+ qp_init_attr.init_attr.cap.max_recv_sge = 1;
+ if (create_tun) {
+ qp_init_attr.init_attr.qp_type = IB_QPT_UD;
+ qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP;
+ qp_init_attr.port = ctx->port;
+ qp_init_attr.slave = ctx->slave;
+ qp_init_attr.proxy_qp_type = qp_type;
+ qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX |
+ IB_QP_QKEY | IB_QP_PORT;
+ } else {
+ qp_init_attr.init_attr.qp_type = qp_type;
+ qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP;
+ qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY;
+ }
+ qp_init_attr.init_attr.port_num = ctx->port;
+ qp_init_attr.init_attr.qp_context = ctx;
+ qp_init_attr.init_attr.event_handler = pv_qp_event_handler;
+ tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr);
+ if (IS_ERR(tun_qp->qp)) {
+ ret = PTR_ERR(tun_qp->qp);
+ tun_qp->qp = NULL;
+ pr_err("Couldn't create %s QP (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ return ret;
+ }
+
+ memset(&attr, 0, sizeof attr);
+ attr.qp_state = IB_QPS_INIT;
+ ret = 0;
+ if (create_tun)
+ ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave,
+ ctx->port, IB_DEFAULT_PKEY_FULL,
+ &attr.pkey_index);
+ if (ret || !create_tun)
+ attr.pkey_index =
+ to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0];
+ attr.qkey = IB_QP1_QKEY;
+ attr.port_num = ctx->port;
+ ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to INIT (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+ attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to RTR (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+ attr.qp_state = IB_QPS_RTS;
+ attr.sq_psn = 0;
+ ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to RTS (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+
+ for (i = 0; i < nmbr_bufs; i++) {
+ ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i);
+ if (ret) {
+ pr_err(" mlx4_ib_post_pv_buf error"
+ " (err = %d, i = %d)\n", ret, i);
+ goto err_qp;
+ }
+ }
+ return 0;
+
+err_qp:
+ ib_destroy_qp(tun_qp->qp);
+ tun_qp->qp = NULL;
+ return ret;
+}
+
+/*
+ * IB MAD completion callback for real SQPs
+ */
+static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+ struct mlx4_ib_demux_pv_qp *sqp;
+ struct ib_wc wc;
+ struct ib_grh *grh;
+ struct ib_mad *mad;
+
+ ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+ ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+ while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+ sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_SEND:
+ kfree(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_WIRE_BUFS - 1)].ah);
+ sqp->tx_ring[wc.wr_id & (MLX4_NUM_WIRE_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ break;
+ case IB_WC_RECV:
+ mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *)
+ (sqp->ring[wc.wr_id &
+ (MLX4_NUM_WIRE_BUFS - 1)].addr))->payload);
+ grh = &(((struct mlx4_mad_rcv_buf *)
+ (sqp->ring[wc.wr_id &
+ (MLX4_NUM_WIRE_BUFS - 1)].addr))->grh);
+ mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad);
+ if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
+ (MLX4_NUM_WIRE_BUFS - 1)))
+ pr_err("Failed reposting SQP "
+ "buf:%lld\n", wc.wr_id);
+ break;
+ default:
+ break;
+ }
+ } else {
+ pr_debug("mlx4_ib: completion error in tunnel: %d."
+ " status = %d, wrid = 0x%llx\n",
+ ctx->slave, wc.status, wc.wr_id);
+ if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+ kfree(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_WIRE_BUFS - 1)].ah);
+ sqp->tx_ring[wc.wr_id & (MLX4_NUM_WIRE_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ }
+ }
+ }
+}
+
+static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port,
+ struct mlx4_ib_demux_pv_ctx **ret_ctx)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+
+ *ret_ctx = NULL;
+ ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ib_dev = &dev->ib_dev;
+ ctx->port = port;
+ ctx->slave = slave;
+ *ret_ctx = ctx;
+ return 0;
+}
+
+static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port)
+{
+ if (dev->sriov.demux[port - 1].tun[slave]) {
+ kfree(dev->sriov.demux[port - 1].tun[slave]);
+ dev->sriov.demux[port - 1].tun[slave] = NULL;
+ }
+}
+
+static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
+ int create_tun, struct mlx4_ib_demux_pv_ctx *ctx)
+{
+ int ret, cq_size;
+ struct ib_cq_init_attr cq_attr = {};
+ const int nmbr_bufs = create_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
+
+ if (ctx->state != DEMUX_PV_STATE_DOWN)
+ return -EEXIST;
+
+ ctx->state = DEMUX_PV_STATE_STARTING;
+ /* have QP0 only if link layer is IB */
+ if (rdma_port_get_link_layer(ibdev, ctx->port) ==
+ IB_LINK_LAYER_INFINIBAND)
+ ctx->has_smi = 1;
+
+ if (ctx->has_smi) {
+ ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun);
+ if (ret) {
+ pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret);
+ goto err_out;
+ }
+ }
+
+ ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun);
+ if (ret) {
+ pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret);
+ goto err_out_qp0;
+ }
+
+ cq_size = 2 * nmbr_bufs;
+ if (ctx->has_smi)
+ cq_size *= 2;
+
+ cq_attr.cqe = cq_size;
+ ctx->cq = ib_create_cq(ctx->ib_dev,
+ create_tun ? mlx4_ib_tunnel_comp_handler : mlx4_ib_wire_comp_handler,
+ NULL, ctx, &cq_attr);
+ if (IS_ERR(ctx->cq)) {
+ ret = PTR_ERR(ctx->cq);
+ pr_err("Couldn't create tunnel CQ (%d)\n", ret);
+ goto err_buf;
+ }
+
+ ctx->pd = ib_alloc_pd(ctx->ib_dev, 0);
+ if (IS_ERR(ctx->pd)) {
+ ret = PTR_ERR(ctx->pd);
+ pr_err("Couldn't create tunnel PD (%d)\n", ret);
+ goto err_cq;
+ }
+
+ if (ctx->has_smi) {
+ ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
+ if (ret) {
+ pr_err("Couldn't create %s QP0 (%d)\n",
+ create_tun ? "tunnel for" : "", ret);
+ goto err_pd;
+ }
+ }
+
+ ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun);
+ if (ret) {
+ pr_err("Couldn't create %s QP1 (%d)\n",
+ create_tun ? "tunnel for" : "", ret);
+ goto err_qp0;
+ }
+
+ if (create_tun)
+ INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker);
+ else
+ INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
+
+ ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+ ctx->wi_wq = to_mdev(ibdev)->sriov.demux[port - 1].wi_wq;
+
+ ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ pr_err("Couldn't arm tunnel cq (%d)\n", ret);
+ goto err_wq;
+ }
+ ctx->state = DEMUX_PV_STATE_ACTIVE;
+ return 0;
+
+err_wq:
+ ctx->wq = NULL;
+ ib_destroy_qp(ctx->qp[1].qp);
+ ctx->qp[1].qp = NULL;
+
+
+err_qp0:
+ if (ctx->has_smi)
+ ib_destroy_qp(ctx->qp[0].qp);
+ ctx->qp[0].qp = NULL;
+
+err_pd:
+ ib_dealloc_pd(ctx->pd);
+ ctx->pd = NULL;
+
+err_cq:
+ ib_destroy_cq(ctx->cq);
+ ctx->cq = NULL;
+
+err_buf:
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun);
+
+err_out_qp0:
+ if (ctx->has_smi)
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun);
+err_out:
+ ctx->state = DEMUX_PV_STATE_DOWN;
+ return ret;
+}
+
+static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
+ struct mlx4_ib_demux_pv_ctx *ctx, int flush)
+{
+ if (!ctx)
+ return;
+ if (ctx->state > DEMUX_PV_STATE_DOWN) {
+ ctx->state = DEMUX_PV_STATE_DOWNING;
+ if (flush)
+ flush_workqueue(ctx->wq);
+ if (ctx->has_smi) {
+ ib_destroy_qp(ctx->qp[0].qp);
+ ctx->qp[0].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1);
+ }
+ ib_destroy_qp(ctx->qp[1].qp);
+ ctx->qp[1].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
+ ib_dealloc_pd(ctx->pd);
+ ctx->pd = NULL;
+ ib_destroy_cq(ctx->cq);
+ ctx->cq = NULL;
+ ctx->state = DEMUX_PV_STATE_DOWN;
+ }
+}
+
+static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave,
+ int port, int do_init)
+{
+ int ret = 0;
+
+ if (!do_init) {
+ clean_vf_mcast(&dev->sriov.demux[port - 1], slave);
+ /* for master, destroy real sqp resources */
+ if (slave == mlx4_master_func_num(dev->dev))
+ destroy_pv_resources(dev, slave, port,
+ dev->sriov.sqps[port - 1], 1);
+ /* destroy the tunnel qp resources */
+ destroy_pv_resources(dev, slave, port,
+ dev->sriov.demux[port - 1].tun[slave], 1);
+ return 0;
+ }
+
+ /* create the tunnel qp resources */
+ ret = create_pv_resources(&dev->ib_dev, slave, port, 1,
+ dev->sriov.demux[port - 1].tun[slave]);
+
+ /* for master, create the real sqp resources */
+ if (!ret && slave == mlx4_master_func_num(dev->dev))
+ ret = create_pv_resources(&dev->ib_dev, slave, port, 0,
+ dev->sriov.sqps[port - 1]);
+ return ret;
+}
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work)
+{
+ struct mlx4_ib_demux_work *dmxw;
+
+ dmxw = container_of(work, struct mlx4_ib_demux_work, work);
+ mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port,
+ dmxw->do_init);
+ kfree(dmxw);
+ return;
+}
+
+static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_demux_ctx *ctx,
+ int port)
+{
+ char name[12];
+ int ret = 0;
+ int i;
+
+ ctx->tun = kcalloc(dev->dev->caps.sqp_demux,
+ sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL);
+ if (!ctx->tun)
+ return -ENOMEM;
+
+ ctx->dev = dev;
+ ctx->port = port;
+ ctx->ib_dev = &dev->ib_dev;
+
+ for (i = 0;
+ i < min(dev->dev->caps.sqp_demux,
+ (u16)(dev->dev->persist->num_vfs + 1));
+ i++) {
+ struct mlx4_active_ports actv_ports =
+ mlx4_get_active_ports(dev->dev, i);
+
+ if (!test_bit(port - 1, actv_ports.ports))
+ continue;
+
+ ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);
+ if (ret) {
+ ret = -ENOMEM;
+ goto err_mcg;
+ }
+ }
+
+ ret = mlx4_ib_mcg_port_init(ctx);
+ if (ret) {
+ pr_err("Failed initializing mcg para-virt (%d)\n", ret);
+ goto err_mcg;
+ }
+
+ snprintf(name, sizeof(name), "mlx4_ibt%d", port);
+ ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (!ctx->wq) {
+ pr_err("Failed to create tunnelling WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_wq;
+ }
+
+ snprintf(name, sizeof(name), "mlx4_ibwi%d", port);
+ ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (!ctx->wi_wq) {
+ pr_err("Failed to create wire WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_wiwq;
+ }
+
+ snprintf(name, sizeof(name), "mlx4_ibud%d", port);
+ ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (!ctx->ud_wq) {
+ pr_err("Failed to create up/down WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_udwq;
+ }
+
+ return 0;
+
+err_udwq:
+ destroy_workqueue(ctx->wi_wq);
+ ctx->wi_wq = NULL;
+
+err_wiwq:
+ destroy_workqueue(ctx->wq);
+ ctx->wq = NULL;
+
+err_wq:
+ mlx4_ib_mcg_port_cleanup(ctx, 1);
+err_mcg:
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++)
+ free_pv_object(dev, i, port);
+ kfree(ctx->tun);
+ ctx->tun = NULL;
+ return ret;
+}
+
+static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
+{
+ if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) {
+ sqp_ctx->state = DEMUX_PV_STATE_DOWNING;
+ flush_workqueue(sqp_ctx->wq);
+ if (sqp_ctx->has_smi) {
+ ib_destroy_qp(sqp_ctx->qp[0].qp);
+ sqp_ctx->qp[0].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0);
+ }
+ ib_destroy_qp(sqp_ctx->qp[1].qp);
+ sqp_ctx->qp[1].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
+ ib_dealloc_pd(sqp_ctx->pd);
+ sqp_ctx->pd = NULL;
+ ib_destroy_cq(sqp_ctx->cq);
+ sqp_ctx->cq = NULL;
+ sqp_ctx->state = DEMUX_PV_STATE_DOWN;
+ }
+}
+
+static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
+{
+ int i;
+ if (ctx) {
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ mlx4_ib_mcg_port_cleanup(ctx, 1);
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (!ctx->tun[i])
+ continue;
+ if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN)
+ ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
+ }
+ flush_workqueue(ctx->wq);
+ flush_workqueue(ctx->wi_wq);
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
+ free_pv_object(dev, i, ctx->port);
+ }
+ kfree(ctx->tun);
+ destroy_workqueue(ctx->ud_wq);
+ destroy_workqueue(ctx->wi_wq);
+ destroy_workqueue(ctx->wq);
+ }
+}
+
+static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init)
+{
+ int i;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+ /* initialize or tear down tunnel QPs for the master */
+ for (i = 0; i < dev->dev->caps.num_ports; i++)
+ mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init);
+ return;
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
+{
+ int i = 0;
+ int err;
+
+ if (!mlx4_is_mfunc(dev->dev))
+ return 0;
+
+ dev->sriov.is_going_down = 0;
+ spin_lock_init(&dev->sriov.going_down_lock);
+ mlx4_ib_cm_paravirt_init(dev);
+
+ mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n");
+
+ if (mlx4_is_slave(dev->dev)) {
+ mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n");
+ return 0;
+ }
+
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (i == mlx4_master_func_num(dev->dev))
+ mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid);
+ else
+ mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid());
+ }
+
+ err = mlx4_ib_init_alias_guid_service(dev);
+ if (err) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
+ goto paravirt_err;
+ }
+ err = mlx4_ib_device_register_sysfs(dev);
+ if (err) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
+ goto sysfs_err;
+ }
+
+ mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n",
+ dev->dev->caps.sqp_demux);
+ for (i = 0; i < dev->num_ports; i++) {
+ union ib_gid gid;
+ err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1);
+ if (err)
+ goto demux_err;
+ dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
+ atomic64_set(&dev->sriov.demux[i].subnet_prefix,
+ be64_to_cpu(gid.global.subnet_prefix));
+ err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
+ &dev->sriov.sqps[i]);
+ if (err)
+ goto demux_err;
+ err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1);
+ if (err)
+ goto free_pv;
+ }
+ mlx4_ib_master_tunnels(dev, 1);
+ return 0;
+
+free_pv:
+ free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+demux_err:
+ while (--i >= 0) {
+ free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+ mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+ }
+ mlx4_ib_device_unregister_sysfs(dev);
+
+sysfs_err:
+ mlx4_ib_destroy_alias_guid_service(dev);
+
+paravirt_err:
+ mlx4_ib_cm_paravirt_clean(dev, -1);
+
+ return err;
+}
+
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev)
+{
+ int i;
+ unsigned long flags;
+
+ if (!mlx4_is_mfunc(dev->dev))
+ return;
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ dev->sriov.is_going_down = 1;
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+ if (mlx4_is_master(dev->dev)) {
+ for (i = 0; i < dev->num_ports; i++) {
+ flush_workqueue(dev->sriov.demux[i].ud_wq);
+ mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]);
+ kfree(dev->sriov.sqps[i]);
+ dev->sriov.sqps[i] = NULL;
+ mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+ }
+
+ mlx4_ib_cm_paravirt_clean(dev, -1);
+ mlx4_ib_destroy_alias_guid_service(dev);
+ mlx4_ib_device_unregister_sysfs(dev);
+ }
+}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
new file mode 100644
index 000000000..ba47874f9
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -0,0 +1,3344 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/devlink.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include <net/bonding.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include <rdma/mlx4-abi.h>
+
+#define DRV_NAME MLX4_IB_DRV_NAME
+#define DRV_VERSION "4.0-0"
+
+#define MLX4_IB_FLOW_MAX_PRIO 0xFFF
+#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF
+#define MLX4_IB_CARD_REV_A0 0xA0
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+int mlx4_ib_sm_guid_assign = 0;
+module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
+MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)");
+
+static const char mlx4_ib_version[] =
+ DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+ DRV_VERSION "\n";
+
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device,
+ u32 port_num);
+
+static struct workqueue_struct *wq;
+
+static int check_flow_steering_support(struct mlx4_dev *dev)
+{
+ int eth_num_ports = 0;
+ int ib_num_ports = 0;
+
+ int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED;
+
+ if (dmfs) {
+ int i;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+ eth_num_ports++;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_num_ports++;
+ dmfs &= (!ib_num_ports ||
+ (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) &&
+ (!eth_num_ports ||
+ (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN));
+ if (ib_num_ports && mlx4_is_mfunc(dev)) {
+ pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n");
+ dmfs = 0;
+ }
+ }
+ return dmfs;
+}
+
+static int num_ib_ports(struct mlx4_dev *dev)
+{
+ int ib_ports = 0;
+ int i;
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_ports++;
+
+ return ib_ports;
+}
+
+static struct net_device *mlx4_ib_get_netdev(struct ib_device *device,
+ u32 port_num)
+{
+ struct mlx4_ib_dev *ibdev = to_mdev(device);
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
+
+ if (dev) {
+ if (mlx4_is_bonded(ibdev->dev)) {
+ struct net_device *upper = NULL;
+
+ upper = netdev_master_upper_dev_get_rcu(dev);
+ if (upper) {
+ struct net_device *active;
+
+ active = bond_option_active_slave_get_rcu(netdev_priv(upper));
+ if (active)
+ dev = active;
+ }
+ }
+ }
+ if (dev)
+ dev_hold(dev);
+
+ rcu_read_unlock();
+ return dev;
+}
+
+static int mlx4_ib_update_gids_v1(struct gid_entry *gids,
+ struct mlx4_ib_dev *ibdev,
+ u32 port_num)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+ struct mlx4_dev *dev = ibdev->dev;
+ int i;
+ union ib_gid *gid_tbl;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return -ENOMEM;
+
+ gid_tbl = mailbox->buf;
+
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+ memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid));
+
+ err = mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_GID_TABLE << 8 | port_num,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (mlx4_is_bonded(dev))
+ err += mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_GID_TABLE << 8 | 2,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids,
+ struct mlx4_ib_dev *ibdev,
+ u32 port_num)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+ struct mlx4_dev *dev = ibdev->dev;
+ int i;
+ struct {
+ union ib_gid gid;
+ __be32 rsrvd1[2];
+ __be16 rsrvd2;
+ u8 type;
+ u8 version;
+ __be32 rsrvd3;
+ } *gid_tbl;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return -ENOMEM;
+
+ gid_tbl = mailbox->buf;
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+ memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid));
+ if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
+ gid_tbl[i].version = 2;
+ if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid))
+ gid_tbl[i].type = 1;
+ }
+ }
+
+ err = mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_ROCE_ADDR << 8 | port_num,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (mlx4_is_bonded(dev))
+ err += mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_ROCE_ADDR << 8 | 2,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+static int mlx4_ib_update_gids(struct gid_entry *gids,
+ struct mlx4_ib_dev *ibdev,
+ u32 port_num)
+{
+ if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+ return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num);
+
+ return mlx4_ib_update_gids_v1(gids, ibdev, port_num);
+}
+
+static void free_gid_entry(struct gid_entry *entry)
+{
+ memset(&entry->gid, 0, sizeof(entry->gid));
+ kfree(entry->ctx);
+ entry->ctx = NULL;
+}
+
+static int mlx4_ib_add_gid(const struct ib_gid_attr *attr, void **context)
+{
+ struct mlx4_ib_dev *ibdev = to_mdev(attr->device);
+ struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+ struct mlx4_port_gid_table *port_gid_table;
+ int free = -1, found = -1;
+ int ret = 0;
+ int hw_update = 0;
+ int i;
+ struct gid_entry *gids = NULL;
+ u16 vlan_id = 0xffff;
+ u8 mac[ETH_ALEN];
+
+ if (!rdma_cap_roce_gid_table(attr->device, attr->port_num))
+ return -EINVAL;
+
+ if (attr->port_num > MLX4_MAX_PORTS)
+ return -EINVAL;
+
+ if (!context)
+ return -EINVAL;
+
+ ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
+ if (ret)
+ return ret;
+ port_gid_table = &iboe->gids[attr->port_num - 1];
+ spin_lock_bh(&iboe->lock);
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+ if (!memcmp(&port_gid_table->gids[i].gid,
+ &attr->gid, sizeof(attr->gid)) &&
+ port_gid_table->gids[i].gid_type == attr->gid_type &&
+ port_gid_table->gids[i].vlan_id == vlan_id) {
+ found = i;
+ break;
+ }
+ if (free < 0 && rdma_is_zero_gid(&port_gid_table->gids[i].gid))
+ free = i; /* HW has space */
+ }
+
+ if (found < 0) {
+ if (free < 0) {
+ ret = -ENOSPC;
+ } else {
+ port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC);
+ if (!port_gid_table->gids[free].ctx) {
+ ret = -ENOMEM;
+ } else {
+ *context = port_gid_table->gids[free].ctx;
+ memcpy(&port_gid_table->gids[free].gid,
+ &attr->gid, sizeof(attr->gid));
+ port_gid_table->gids[free].gid_type = attr->gid_type;
+ port_gid_table->gids[free].vlan_id = vlan_id;
+ port_gid_table->gids[free].ctx->real_index = free;
+ port_gid_table->gids[free].ctx->refcount = 1;
+ hw_update = 1;
+ }
+ }
+ } else {
+ struct gid_cache_context *ctx = port_gid_table->gids[found].ctx;
+ *context = ctx;
+ ctx->refcount++;
+ }
+ if (!ret && hw_update) {
+ gids = kmalloc_array(MLX4_MAX_PORT_GIDS, sizeof(*gids),
+ GFP_ATOMIC);
+ if (!gids) {
+ ret = -ENOMEM;
+ *context = NULL;
+ free_gid_entry(&port_gid_table->gids[free]);
+ } else {
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) {
+ memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+ gids[i].gid_type = port_gid_table->gids[i].gid_type;
+ }
+ }
+ }
+ spin_unlock_bh(&iboe->lock);
+
+ if (!ret && hw_update) {
+ ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num);
+ if (ret) {
+ spin_lock_bh(&iboe->lock);
+ *context = NULL;
+ free_gid_entry(&port_gid_table->gids[free]);
+ spin_unlock_bh(&iboe->lock);
+ }
+ kfree(gids);
+ }
+
+ return ret;
+}
+
+static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context)
+{
+ struct gid_cache_context *ctx = *context;
+ struct mlx4_ib_dev *ibdev = to_mdev(attr->device);
+ struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+ struct mlx4_port_gid_table *port_gid_table;
+ int ret = 0;
+ int hw_update = 0;
+ struct gid_entry *gids = NULL;
+
+ if (!rdma_cap_roce_gid_table(attr->device, attr->port_num))
+ return -EINVAL;
+
+ if (attr->port_num > MLX4_MAX_PORTS)
+ return -EINVAL;
+
+ port_gid_table = &iboe->gids[attr->port_num - 1];
+ spin_lock_bh(&iboe->lock);
+ if (ctx) {
+ ctx->refcount--;
+ if (!ctx->refcount) {
+ unsigned int real_index = ctx->real_index;
+
+ free_gid_entry(&port_gid_table->gids[real_index]);
+ hw_update = 1;
+ }
+ }
+ if (!ret && hw_update) {
+ int i;
+
+ gids = kmalloc_array(MLX4_MAX_PORT_GIDS, sizeof(*gids),
+ GFP_ATOMIC);
+ if (!gids) {
+ ret = -ENOMEM;
+ } else {
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) {
+ memcpy(&gids[i].gid,
+ &port_gid_table->gids[i].gid,
+ sizeof(union ib_gid));
+ gids[i].gid_type =
+ port_gid_table->gids[i].gid_type;
+ }
+ }
+ }
+ spin_unlock_bh(&iboe->lock);
+
+ if (!ret && hw_update) {
+ ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num);
+ kfree(gids);
+ }
+ return ret;
+}
+
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+ const struct ib_gid_attr *attr)
+{
+ struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+ struct gid_cache_context *ctx = NULL;
+ struct mlx4_port_gid_table *port_gid_table;
+ int real_index = -EINVAL;
+ int i;
+ unsigned long flags;
+ u32 port_num = attr->port_num;
+
+ if (port_num > MLX4_MAX_PORTS)
+ return -EINVAL;
+
+ if (mlx4_is_bonded(ibdev->dev))
+ port_num = 1;
+
+ if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num))
+ return attr->index;
+
+ spin_lock_irqsave(&iboe->lock, flags);
+ port_gid_table = &iboe->gids[port_num - 1];
+
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+ if (!memcmp(&port_gid_table->gids[i].gid,
+ &attr->gid, sizeof(attr->gid)) &&
+ attr->gid_type == port_gid_table->gids[i].gid_type) {
+ ctx = port_gid_table->gids[i].ctx;
+ break;
+ }
+ if (ctx)
+ real_index = ctx->real_index;
+ spin_unlock_irqrestore(&iboe->lock, flags);
+ return real_index;
+}
+
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props,
+ struct ib_udata *uhw)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err;
+ int have_ib_ports;
+ struct mlx4_uverbs_ex_query_device cmd;
+ struct mlx4_uverbs_ex_query_device_resp resp = {};
+ struct mlx4_clock_params clock_params;
+
+ if (uhw->inlen) {
+ if (uhw->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, uhw, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+ }
+
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!in_mad || !out_mad)
+ goto out;
+
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS,
+ 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ have_ib_ports = num_ib_ports(dev->dev);
+
+ props->fw_ver = dev->dev->caps.fw_ver;
+ props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+ props->kernel_cap_flags = IBK_BLOCK_MULTICAST_LOOPBACK;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports)
+ props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+ props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+ props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+ if (dev->dev->caps.max_gso_sz &&
+ (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) &&
+ (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH))
+ props->kernel_cap_flags |= IBK_UD_TSO;
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
+ props->kernel_cap_flags |= IBK_LOCAL_DMA_LKEY;
+ if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) &&
+ (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) &&
+ (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR))
+ props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
+ props->device_cap_flags |= IB_DEVICE_XRC;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW;
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B)
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;
+ else
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A;
+ }
+ if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
+ props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
+
+ props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
+
+ props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = dev->dev->persist->pdev->device;
+ props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = dev->dev->caps.page_size_cap;
+ props->max_qp = dev->dev->quotas.qp;
+ props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
+ props->max_send_sge =
+ min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg);
+ props->max_recv_sge =
+ min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg);
+ props->max_sge_rd = MLX4_MAX_SGE_RD;
+ props->max_cq = dev->dev->quotas.cq;
+ props->max_cqe = dev->dev->caps.max_cqes;
+ props->max_mr = dev->dev->quotas.mpt;
+ props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+ props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma;
+ props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+ props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
+ props->max_srq = dev->dev->quotas.srq;
+ props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1;
+ props->max_srq_sge = dev->dev->caps.max_srq_sge;
+ props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES;
+ props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay;
+ props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+ props->masked_atomic_cap = props->atomic_cap;
+ props->max_pkeys = dev->dev->caps.pkey_table_len[1];
+ props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+ props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+ props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL;
+ props->timestamp_mask = 0xFFFFFFFFFFFFULL;
+ props->max_ah = INT_MAX;
+
+ if (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET ||
+ mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET) {
+ if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) {
+ props->rss_caps.max_rwq_indirection_tables =
+ props->max_qp;
+ props->rss_caps.max_rwq_indirection_table_size =
+ dev->dev->caps.max_rss_tbl_sz;
+ props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
+ props->max_wq_type_rq = props->max_qp;
+ }
+
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)
+ props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
+ }
+
+ props->cq_caps.max_cq_moderation_count = MLX4_MAX_CQ_COUNT;
+ props->cq_caps.max_cq_moderation_period = MLX4_MAX_CQ_PERIOD;
+
+ if (uhw->outlen >= resp.response_length + sizeof(resp.hca_core_clock_offset)) {
+ resp.response_length += sizeof(resp.hca_core_clock_offset);
+ if (!mlx4_get_internal_clock_params(dev->dev, &clock_params)) {
+ resp.comp_mask |= MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET;
+ resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE;
+ }
+ }
+
+ if (uhw->outlen >= resp.response_length +
+ sizeof(resp.max_inl_recv_sz)) {
+ resp.response_length += sizeof(resp.max_inl_recv_sz);
+ resp.max_inl_recv_sz = dev->dev->caps.max_rq_sg *
+ sizeof(struct mlx4_wqe_data_seg);
+ }
+
+ if (offsetofend(typeof(resp), rss_caps) <= uhw->outlen) {
+ if (props->rss_caps.supported_qpts) {
+ resp.rss_caps.rx_hash_function =
+ MLX4_IB_RX_HASH_FUNC_TOEPLITZ;
+
+ resp.rss_caps.rx_hash_fields_mask =
+ MLX4_IB_RX_HASH_SRC_IPV4 |
+ MLX4_IB_RX_HASH_DST_IPV4 |
+ MLX4_IB_RX_HASH_SRC_IPV6 |
+ MLX4_IB_RX_HASH_DST_IPV6 |
+ MLX4_IB_RX_HASH_SRC_PORT_TCP |
+ MLX4_IB_RX_HASH_DST_PORT_TCP |
+ MLX4_IB_RX_HASH_SRC_PORT_UDP |
+ MLX4_IB_RX_HASH_DST_PORT_UDP;
+
+ if (dev->dev->caps.tunnel_offload_mode ==
+ MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+ resp.rss_caps.rx_hash_fields_mask |=
+ MLX4_IB_RX_HASH_INNER;
+ }
+ resp.response_length = offsetof(typeof(resp), rss_caps) +
+ sizeof(resp.rss_caps);
+ }
+
+ if (offsetofend(typeof(resp), tso_caps) <= uhw->outlen) {
+ if (dev->dev->caps.max_gso_sz &&
+ ((mlx4_ib_port_link_layer(ibdev, 1) ==
+ IB_LINK_LAYER_ETHERNET) ||
+ (mlx4_ib_port_link_layer(ibdev, 2) ==
+ IB_LINK_LAYER_ETHERNET))) {
+ resp.tso_caps.max_tso = dev->dev->caps.max_gso_sz;
+ resp.tso_caps.supported_qpts |=
+ 1 << IB_QPT_RAW_PACKET;
+ }
+ resp.response_length = offsetof(typeof(resp), tso_caps) +
+ sizeof(resp.tso_caps);
+ }
+
+ if (uhw->outlen) {
+ err = ib_copy_to_udata(uhw, &resp, resp.response_length);
+ if (err)
+ goto out;
+ }
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+static enum rdma_link_layer
+mlx4_ib_port_link_layer(struct ib_device *device, u32 port_num)
+{
+ struct mlx4_dev *dev = to_mdev(device)->dev;
+
+ return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ?
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+}
+
+static int ib_link_query_port(struct ib_device *ibdev, u32 port,
+ struct ib_port_attr *props, int netw_view)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int ext_active_speed;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+ in_mad, out_mad);
+ if (err)
+ goto out;
+
+
+ props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20));
+ if (netw_view)
+ props->gid_tbl_len = out_mad->data[50];
+ else
+ props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
+ props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz;
+ props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port];
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+ props->max_mtu = out_mad->data[41] & 0xf;
+ props->active_mtu = out_mad->data[36] >> 4;
+ props->subnet_timeout = out_mad->data[51] & 0x1f;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = out_mad->data[41] >> 4;
+
+ /* Check if extended speeds (EDR/FDR/...) are supported */
+ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
+ ext_active_speed = out_mad->data[62] >> 4;
+
+ switch (ext_active_speed) {
+ case 1:
+ props->active_speed = IB_SPEED_FDR;
+ break;
+ case 2:
+ props->active_speed = IB_SPEED_EDR;
+ break;
+ }
+ }
+
+ /* If reported active speed is QDR, check if is FDR-10 */
+ if (props->active_speed == IB_SPEED_QDR) {
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port,
+ NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ /* Checking LinkSpeedActive for FDR-10 */
+ if (out_mad->data[15] & 0x1)
+ props->active_speed = IB_SPEED_FDR10;
+ }
+
+ /* Avoid wrong speed value returned by FW if the IB link is down. */
+ if (props->state == IB_PORT_DOWN)
+ props->active_speed = IB_SPEED_SDR;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static u8 state_to_phys_state(enum ib_port_state state)
+{
+ return state == IB_PORT_ACTIVE ?
+ IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
+}
+
+static int eth_link_query_port(struct ib_device *ibdev, u32 port,
+ struct ib_port_attr *props)
+{
+
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+ struct mlx4_ib_iboe *iboe = &mdev->iboe;
+ struct net_device *ndev;
+ enum ib_mtu tmp;
+ struct mlx4_cmd_mailbox *mailbox;
+ int err = 0;
+ int is_bonded = mlx4_is_bonded(mdev->dev);
+
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+ MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (err)
+ goto out;
+
+ props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ||
+ (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ?
+ IB_WIDTH_4X : IB_WIDTH_1X;
+ props->active_speed = (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ?
+ IB_SPEED_FDR : IB_SPEED_QDR;
+ props->port_cap_flags = IB_PORT_CM_SUP;
+ props->ip_gids = true;
+ props->gid_tbl_len = mdev->dev->caps.gid_table_len[port];
+ props->max_msg_sz = mdev->dev->caps.max_msg_sz;
+ if (mdev->dev->caps.pkey_table_len[port])
+ props->pkey_tbl_len = 1;
+ props->max_mtu = IB_MTU_4096;
+ props->max_vl_num = 2;
+ props->state = IB_PORT_DOWN;
+ props->phys_state = state_to_phys_state(props->state);
+ props->active_mtu = IB_MTU_256;
+ spin_lock_bh(&iboe->lock);
+ ndev = iboe->netdevs[port - 1];
+ if (ndev && is_bonded) {
+ rcu_read_lock(); /* required to get upper dev */
+ ndev = netdev_master_upper_dev_get_rcu(ndev);
+ rcu_read_unlock();
+ }
+ if (!ndev)
+ goto out_unlock;
+
+ tmp = iboe_get_mtu(ndev->mtu);
+ props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
+
+ props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+ props->phys_state = state_to_phys_state(props->state);
+out_unlock:
+ spin_unlock_bh(&iboe->lock);
+out:
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return err;
+}
+
+int __mlx4_ib_query_port(struct ib_device *ibdev, u32 port,
+ struct ib_port_attr *props, int netw_view)
+{
+ int err;
+
+ /* props being zeroed by the caller, avoid zeroing it here */
+
+ err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ?
+ ib_link_query_port(ibdev, port, props, netw_view) :
+ eth_link_query_port(ibdev, port, props);
+
+ return err;
+}
+
+static int mlx4_ib_query_port(struct ib_device *ibdev, u32 port,
+ struct ib_port_attr *props)
+{
+ /* returns host view */
+ return __mlx4_ib_query_port(ibdev, port, props, 0);
+}
+
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
+ union ib_gid *gid, int netw_view)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int clear = 0;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ if (mlx4_is_mfunc(dev->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ if (mlx4_is_mfunc(dev->dev) && !netw_view) {
+ if (index) {
+ /* For any index > 0, return the null guid */
+ err = 0;
+ clear = 1;
+ goto out;
+ }
+ }
+
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, port,
+ NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+ if (clear)
+ memset(gid->raw + 8, 0, 8);
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
+ union ib_gid *gid)
+{
+ if (rdma_protocol_ib(ibdev, port))
+ return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
+ return 0;
+}
+
+static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u32 port,
+ u64 *sl2vl_tbl)
+{
+ union sl2vl_tbl_to_u64 sl2vl64;
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+ int jj;
+
+ if (mlx4_is_slave(to_mdev(ibdev)->dev)) {
+ *sl2vl_tbl = 0;
+ return 0;
+ }
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_SL_TO_VL_TABLE;
+ in_mad->attr_mod = 0;
+
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev))
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+ in_mad, out_mad);
+ if (err)
+ goto out;
+
+ for (jj = 0; jj < 8; jj++)
+ sl2vl64.sl8[jj] = ((struct ib_smp *)out_mad)->data[jj];
+ *sl2vl_tbl = sl2vl64.sl64;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static void mlx4_init_sl2vl_tbl(struct mlx4_ib_dev *mdev)
+{
+ u64 sl2vl;
+ int i;
+ int err;
+
+ for (i = 1; i <= mdev->dev->caps.num_ports; i++) {
+ if (mdev->dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH)
+ continue;
+ err = mlx4_ib_query_sl2vl(&mdev->ib_dev, i, &sl2vl);
+ if (err) {
+ pr_err("Unable to get default sl to vl mapping for port %d. Using all zeroes (%d)\n",
+ i, err);
+ sl2vl = 0;
+ }
+ atomic64_set(&mdev->sl2vl[i - 1], sl2vl);
+ }
+}
+
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
+ u16 *pkey, int netw_view)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+ in_mad, out_mad);
+ if (err)
+ goto out;
+
+ *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
+ u16 *pkey)
+{
+ return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0);
+}
+
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+ struct ib_device_modify *props)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ unsigned long flags;
+
+ if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+ return -EOPNOTSUPP;
+
+ if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+ return 0;
+
+ if (mlx4_is_slave(to_mdev(ibdev)->dev))
+ return -EOPNOTSUPP;
+
+ spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags);
+ memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
+ spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags);
+
+ /*
+ * If possible, pass node desc to FW, so it can generate
+ * a 144 trap. If cmd fails, just ignore.
+ */
+ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev);
+ if (IS_ERR(mailbox))
+ return 0;
+
+ memcpy(mailbox->buf, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
+ mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
+ MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+ mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
+
+ return 0;
+}
+
+static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u32 port,
+ int reset_qkey_viols, u32 cap_mask)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+ *(u8 *) mailbox->buf = !!reset_qkey_viols << 6;
+ ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
+ } else {
+ ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols;
+ ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);
+ }
+
+ err = mlx4_cmd(dev->dev, mailbox->dma, port, MLX4_SET_PORT_IB_OPCODE,
+ MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+
+ mlx4_free_cmd_mailbox(dev->dev, mailbox);
+ return err;
+}
+
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u32 port, int mask,
+ struct ib_port_modify *props)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+ u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+ struct ib_port_attr attr;
+ u32 cap_mask;
+ int err;
+
+ /* return OK if this is RoCE. CM calls ib_modify_port() regardless
+ * of whether port link layer is ETH or IB. For ETH ports, qkey
+ * violations and port capabilities are not meaningful.
+ */
+ if (is_eth)
+ return 0;
+
+ mutex_lock(&mdev->cap_mask_mutex);
+
+ err = ib_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mlx4_ib_SET_PORT(mdev, port,
+ !!(mask & IB_PORT_RESET_QKEY_CNTR),
+ cap_mask);
+
+out:
+ mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+ return err;
+}
+
+static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
+ struct ib_udata *udata)
+{
+ struct ib_device *ibdev = uctx->device;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_ucontext *context = to_mucontext(uctx);
+ struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
+ struct mlx4_ib_alloc_ucontext_resp resp;
+ int err;
+
+ if (!dev->ib_active)
+ return -EAGAIN;
+
+ if (ibdev->ops.uverbs_abi_ver ==
+ MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+ resp_v3.qp_tab_size = dev->dev->caps.num_qps;
+ resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size;
+ resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+ } else {
+ resp.dev_caps = dev->dev->caps.userspace_caps;
+ resp.qp_tab_size = dev->dev->caps.num_qps;
+ resp.bf_reg_size = dev->dev->caps.bf_reg_size;
+ resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+ resp.cqe_size = dev->dev->caps.cqe_size;
+ }
+
+ err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&context->db_page_list);
+ mutex_init(&context->db_page_mutex);
+
+ INIT_LIST_HEAD(&context->wqn_ranges_list);
+ mutex_init(&context->wqn_ranges_mutex);
+
+ if (ibdev->ops.uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+ err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
+ else
+ err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+
+ if (err) {
+ mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
+ return -EFAULT;
+ }
+
+ return err;
+}
+
+static void mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+ struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+
+ mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
+}
+
+static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+}
+
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct mlx4_ib_dev *dev = to_mdev(context->device);
+
+ switch (vma->vm_pgoff) {
+ case 0:
+ return rdma_user_mmap_io(context, vma,
+ to_mucontext(context)->uar.pfn,
+ PAGE_SIZE,
+ pgprot_noncached(vma->vm_page_prot),
+ NULL);
+
+ case 1:
+ if (dev->dev->caps.bf_reg_size == 0)
+ return -EINVAL;
+ return rdma_user_mmap_io(
+ context, vma,
+ to_mucontext(context)->uar.pfn +
+ dev->dev->caps.num_uars,
+ PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot),
+ NULL);
+
+ case 3: {
+ struct mlx4_clock_params params;
+ int ret;
+
+ ret = mlx4_get_internal_clock_params(dev->dev, &params);
+ if (ret)
+ return ret;
+
+ return rdma_user_mmap_io(
+ context, vma,
+ (pci_resource_start(dev->dev->persist->pdev,
+ params.bar) +
+ params.offset) >>
+ PAGE_SHIFT,
+ PAGE_SIZE, pgprot_noncached(vma->vm_page_prot),
+ NULL);
+ }
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+ struct mlx4_ib_pd *pd = to_mpd(ibpd);
+ struct ib_device *ibdev = ibpd->device;
+ int err;
+
+ err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
+ if (err)
+ return err;
+
+ if (udata && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) {
+ mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+ mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
+ return 0;
+}
+
+static int mlx4_ib_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
+ struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
+ struct ib_cq_init_attr cq_attr = {};
+ int err;
+
+ if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return -EOPNOTSUPP;
+
+ err = mlx4_xrcd_alloc(dev->dev, &xrcd->xrcdn);
+ if (err)
+ return err;
+
+ xrcd->pd = ib_alloc_pd(ibxrcd->device, 0);
+ if (IS_ERR(xrcd->pd)) {
+ err = PTR_ERR(xrcd->pd);
+ goto err2;
+ }
+
+ cq_attr.cqe = 1;
+ xrcd->cq = ib_create_cq(ibxrcd->device, NULL, NULL, xrcd, &cq_attr);
+ if (IS_ERR(xrcd->cq)) {
+ err = PTR_ERR(xrcd->cq);
+ goto err3;
+ }
+
+ return 0;
+
+err3:
+ ib_dealloc_pd(xrcd->pd);
+err2:
+ mlx4_xrcd_free(dev->dev, xrcd->xrcdn);
+ return err;
+}
+
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
+{
+ ib_destroy_cq(to_mxrcd(xrcd)->cq);
+ ib_dealloc_pd(to_mxrcd(xrcd)->pd);
+ mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
+ return 0;
+}
+
+static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_ib_gid_entry *ge;
+
+ ge = kzalloc(sizeof *ge, GFP_KERNEL);
+ if (!ge)
+ return -ENOMEM;
+
+ ge->gid = *gid;
+ if (mlx4_ib_add_mc(mdev, mqp, gid)) {
+ ge->port = mqp->port;
+ ge->added = 1;
+ }
+
+ mutex_lock(&mqp->mutex);
+ list_add_tail(&ge->list, &mqp->gid_list);
+ mutex_unlock(&mqp->mutex);
+
+ return 0;
+}
+
+static void mlx4_ib_delete_counters_table(struct mlx4_ib_dev *ibdev,
+ struct mlx4_ib_counters *ctr_table)
+{
+ struct counter_index *counter, *tmp_count;
+
+ mutex_lock(&ctr_table->mutex);
+ list_for_each_entry_safe(counter, tmp_count, &ctr_table->counters_list,
+ list) {
+ if (counter->allocated)
+ mlx4_counter_free(ibdev->dev, counter->index);
+ list_del(&counter->list);
+ kfree(counter);
+ }
+ mutex_unlock(&ctr_table->mutex);
+}
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ union ib_gid *gid)
+{
+ struct net_device *ndev;
+ int ret = 0;
+
+ if (!mqp->port)
+ return 0;
+
+ spin_lock_bh(&mdev->iboe.lock);
+ ndev = mdev->iboe.netdevs[mqp->port - 1];
+ if (ndev)
+ dev_hold(ndev);
+ spin_unlock_bh(&mdev->iboe.lock);
+
+ if (ndev) {
+ ret = 1;
+ dev_put(ndev);
+ }
+
+ return ret;
+}
+
+struct mlx4_ib_steering {
+ struct list_head list;
+ struct mlx4_flow_reg_id reg_id;
+ union ib_gid gid;
+};
+
+#define LAST_ETH_FIELD vlan_tag
+#define LAST_IB_FIELD sl
+#define LAST_IPV4_FIELD dst_ip
+#define LAST_TCP_UDP_FIELD src_port
+
+/* Field is the last supported field */
+#define FIELDS_NOT_SUPPORTED(filter, field)\
+ memchr_inv((void *)&filter.field +\
+ sizeof(filter.field), 0,\
+ sizeof(filter) -\
+ offsetof(typeof(filter), field) -\
+ sizeof(filter.field))
+
+static int parse_flow_attr(struct mlx4_dev *dev,
+ u32 qp_num,
+ union ib_flow_spec *ib_spec,
+ struct _rule_hw *mlx4_spec)
+{
+ enum mlx4_net_trans_rule_id type;
+
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ETH:
+ if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
+ return -ENOTSUPP;
+
+ type = MLX4_NET_TRANS_RULE_ID_ETH;
+ memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac,
+ ETH_ALEN);
+ memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac,
+ ETH_ALEN);
+ mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag;
+ mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag;
+ break;
+ case IB_FLOW_SPEC_IB:
+ if (FIELDS_NOT_SUPPORTED(ib_spec->ib.mask, LAST_IB_FIELD))
+ return -ENOTSUPP;
+
+ type = MLX4_NET_TRANS_RULE_ID_IB;
+ mlx4_spec->ib.l3_qpn =
+ cpu_to_be32(qp_num);
+ mlx4_spec->ib.qpn_mask =
+ cpu_to_be32(MLX4_IB_FLOW_QPN_MASK);
+ break;
+
+
+ case IB_FLOW_SPEC_IPV4:
+ if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
+ return -ENOTSUPP;
+
+ type = MLX4_NET_TRANS_RULE_ID_IPV4;
+ mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip;
+ mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip;
+ mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip;
+ mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip;
+ break;
+
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD))
+ return -ENOTSUPP;
+
+ type = ib_spec->type == IB_FLOW_SPEC_TCP ?
+ MLX4_NET_TRANS_RULE_ID_TCP :
+ MLX4_NET_TRANS_RULE_ID_UDP;
+ mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port;
+ mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port;
+ mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port;
+ mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 ||
+ mlx4_hw_rule_sz(dev, type) < 0)
+ return -EINVAL;
+ mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type));
+ mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2;
+ return mlx4_hw_rule_sz(dev, type);
+}
+
+struct default_rules {
+ __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u8 link_layer;
+};
+static const struct default_rules default_table[] = {
+ {
+ .mandatory_fields = {IB_FLOW_SPEC_IPV4},
+ .mandatory_not_fields = {IB_FLOW_SPEC_ETH},
+ .rules_create_list = {IB_FLOW_SPEC_IB},
+ .link_layer = IB_LINK_LAYER_INFINIBAND
+ }
+};
+
+static int __mlx4_ib_default_rules_match(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr)
+{
+ int i, j, k;
+ void *ib_flow;
+ const struct default_rules *pdefault_rules = default_table;
+ u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port);
+
+ for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) {
+ __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ memset(&field_types, 0, sizeof(field_types));
+
+ if (link_layer != pdefault_rules->link_layer)
+ continue;
+
+ ib_flow = flow_attr + 1;
+ /* we assume the specs are sorted */
+ for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS &&
+ j < flow_attr->num_of_specs; k++) {
+ union ib_flow_spec *current_flow =
+ (union ib_flow_spec *)ib_flow;
+
+ /* same layer but different type */
+ if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) ==
+ (pdefault_rules->mandatory_fields[k] &
+ IB_FLOW_SPEC_LAYER_MASK)) &&
+ (current_flow->type !=
+ pdefault_rules->mandatory_fields[k]))
+ goto out;
+
+ /* same layer, try match next one */
+ if (current_flow->type ==
+ pdefault_rules->mandatory_fields[k]) {
+ j++;
+ ib_flow +=
+ ((union ib_flow_spec *)ib_flow)->size;
+ }
+ }
+
+ ib_flow = flow_attr + 1;
+ for (j = 0; j < flow_attr->num_of_specs;
+ j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size)
+ for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++)
+ /* same layer and same type */
+ if (((union ib_flow_spec *)ib_flow)->type ==
+ pdefault_rules->mandatory_not_fields[k])
+ goto out;
+
+ return i;
+ }
+out:
+ return -1;
+}
+
+static int __mlx4_ib_create_default_rules(
+ struct mlx4_ib_dev *mdev,
+ struct ib_qp *qp,
+ const struct default_rules *pdefault_rules,
+ struct _rule_hw *mlx4_spec) {
+ int size = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) {
+ union ib_flow_spec ib_spec = {};
+ int ret;
+
+ switch (pdefault_rules->rules_create_list[i]) {
+ case 0:
+ /* no rule */
+ continue;
+ case IB_FLOW_SPEC_IB:
+ ib_spec.type = IB_FLOW_SPEC_IB;
+ ib_spec.size = sizeof(struct ib_flow_spec_ib);
+
+ break;
+ default:
+ /* invalid rule */
+ return -EINVAL;
+ }
+ /* We must put empty rule, qpn is being ignored */
+ ret = parse_flow_attr(mdev->dev, 0, &ib_spec,
+ mlx4_spec);
+ if (ret < 0) {
+ pr_info("invalid parsing\n");
+ return -EINVAL;
+ }
+
+ mlx4_spec = (void *)mlx4_spec + ret;
+ size += ret;
+ }
+ return size;
+}
+
+static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ int domain,
+ enum mlx4_net_trans_promisc_mode flow_type,
+ u64 *reg_id)
+{
+ int ret, i;
+ int size = 0;
+ void *ib_flow;
+ struct mlx4_ib_dev *mdev = to_mdev(qp->device);
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+ int default_flow;
+
+ if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
+ pr_err("Invalid priority value %d\n", flow_attr->priority);
+ return -EINVAL;
+ }
+
+ if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
+ return -EINVAL;
+
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ ctrl = mailbox->buf;
+
+ ctrl->prio = cpu_to_be16(domain | flow_attr->priority);
+ ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type);
+ ctrl->port = flow_attr->port;
+ ctrl->qpn = cpu_to_be32(qp->qp_num);
+
+ ib_flow = flow_attr + 1;
+ size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+ /* Add default flows */
+ default_flow = __mlx4_ib_default_rules_match(qp, flow_attr);
+ if (default_flow >= 0) {
+ ret = __mlx4_ib_create_default_rules(
+ mdev, qp, default_table + default_flow,
+ mailbox->buf + size);
+ if (ret < 0) {
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return -EINVAL;
+ }
+ size += ret;
+ }
+ for (i = 0; i < flow_attr->num_of_specs; i++) {
+ ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow,
+ mailbox->buf + size);
+ if (ret < 0) {
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return -EINVAL;
+ }
+ ib_flow += ((union ib_flow_spec *) ib_flow)->size;
+ size += ret;
+ }
+
+ if (mlx4_is_master(mdev->dev) && flow_type == MLX4_FS_REGULAR &&
+ flow_attr->num_of_specs == 1) {
+ struct _rule_hw *rule_header = (struct _rule_hw *)(ctrl + 1);
+ enum ib_flow_spec_type header_spec =
+ ((union ib_flow_spec *)(flow_attr + 1))->type;
+
+ if (header_spec == IB_FLOW_SPEC_ETH)
+ mlx4_handle_eth_header_mcast_prio(ctrl, rule_header);
+ }
+
+ ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0,
+ MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+ MLX4_CMD_NATIVE);
+ if (ret == -ENOMEM)
+ pr_err("mcg table is full. Fail to register network rule.\n");
+ else if (ret == -ENXIO)
+ pr_err("Device managed flow steering is disabled. Fail to register network rule.\n");
+ else if (ret)
+ pr_err("Invalid argument. Fail to register network rule.\n");
+
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return ret;
+}
+
+static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id)
+{
+ int err;
+ err = mlx4_cmd(dev, reg_id, 0, 0,
+ MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+ MLX4_CMD_NATIVE);
+ if (err)
+ pr_err("Fail to detach network rule. registration id = 0x%llx\n",
+ reg_id);
+ return err;
+}
+
+static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ u64 *reg_id)
+{
+ void *ib_flow;
+ union ib_flow_spec *ib_spec;
+ struct mlx4_dev *dev = to_mdev(qp->device)->dev;
+ int err = 0;
+
+ if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN ||
+ dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC)
+ return 0; /* do nothing */
+
+ ib_flow = flow_attr + 1;
+ ib_spec = (union ib_flow_spec *)ib_flow;
+
+ if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1)
+ return 0; /* do nothing */
+
+ err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac,
+ flow_attr->port, qp->qp_num,
+ MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff),
+ reg_id);
+ return err;
+}
+
+static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev,
+ struct ib_flow_attr *flow_attr,
+ enum mlx4_net_trans_promisc_mode *type)
+{
+ int err = 0;
+
+ if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER) ||
+ (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) ||
+ (flow_attr->num_of_specs > 1) || (flow_attr->priority != 0)) {
+ return -EOPNOTSUPP;
+ }
+
+ if (flow_attr->num_of_specs == 0) {
+ type[0] = MLX4_FS_MC_SNIFFER;
+ type[1] = MLX4_FS_UC_SNIFFER;
+ } else {
+ union ib_flow_spec *ib_spec;
+
+ ib_spec = (union ib_flow_spec *)(flow_attr + 1);
+ if (ib_spec->type != IB_FLOW_SPEC_ETH)
+ return -EINVAL;
+
+ /* if all is zero than MC and UC */
+ if (is_zero_ether_addr(ib_spec->eth.mask.dst_mac)) {
+ type[0] = MLX4_FS_MC_SNIFFER;
+ type[1] = MLX4_FS_UC_SNIFFER;
+ } else {
+ u8 mac[ETH_ALEN] = {ib_spec->eth.mask.dst_mac[0] ^ 0x01,
+ ib_spec->eth.mask.dst_mac[1],
+ ib_spec->eth.mask.dst_mac[2],
+ ib_spec->eth.mask.dst_mac[3],
+ ib_spec->eth.mask.dst_mac[4],
+ ib_spec->eth.mask.dst_mac[5]};
+
+ /* Above xor was only on MC bit, non empty mask is valid
+ * only if this bit is set and rest are zero.
+ */
+ if (!is_zero_ether_addr(&mac[0]))
+ return -EINVAL;
+
+ if (is_multicast_ether_addr(ib_spec->eth.val.dst_mac))
+ type[0] = MLX4_FS_MC_SNIFFER;
+ else
+ type[0] = MLX4_FS_UC_SNIFFER;
+ }
+ }
+
+ return err;
+}
+
+static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr,
+ struct ib_udata *udata)
+{
+ int err = 0, i = 0, j = 0;
+ struct mlx4_ib_flow *mflow;
+ enum mlx4_net_trans_promisc_mode type[2];
+ struct mlx4_dev *dev = (to_mdev(qp->device))->dev;
+ int is_bonded = mlx4_is_bonded(dev);
+
+ if (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) &&
+ (flow_attr->type != IB_FLOW_ATTR_NORMAL))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (udata &&
+ udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ memset(type, 0, sizeof(type));
+
+ mflow = kzalloc(sizeof(*mflow), GFP_KERNEL);
+ if (!mflow) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ switch (flow_attr->type) {
+ case IB_FLOW_ATTR_NORMAL:
+ /* If dont trap flag (continue match) is set, under specific
+ * condition traffic be replicated to given qp,
+ * without stealing it
+ */
+ if (unlikely(flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)) {
+ err = mlx4_ib_add_dont_trap_rule(dev,
+ flow_attr,
+ type);
+ if (err)
+ goto err_free;
+ } else {
+ type[0] = MLX4_FS_REGULAR;
+ }
+ break;
+
+ case IB_FLOW_ATTR_ALL_DEFAULT:
+ type[0] = MLX4_FS_ALL_DEFAULT;
+ break;
+
+ case IB_FLOW_ATTR_MC_DEFAULT:
+ type[0] = MLX4_FS_MC_DEFAULT;
+ break;
+
+ case IB_FLOW_ATTR_SNIFFER:
+ type[0] = MLX4_FS_MIRROR_RX_PORT;
+ type[1] = MLX4_FS_MIRROR_SX_PORT;
+ break;
+
+ default:
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ while (i < ARRAY_SIZE(type) && type[i]) {
+ err = __mlx4_ib_create_flow(qp, flow_attr, MLX4_DOMAIN_UVERBS,
+ type[i], &mflow->reg_id[i].id);
+ if (err)
+ goto err_create_flow;
+ if (is_bonded) {
+ /* Application always sees one port so the mirror rule
+ * must be on port #2
+ */
+ flow_attr->port = 2;
+ err = __mlx4_ib_create_flow(qp, flow_attr,
+ MLX4_DOMAIN_UVERBS, type[j],
+ &mflow->reg_id[j].mirror);
+ flow_attr->port = 1;
+ if (err)
+ goto err_create_flow;
+ j++;
+ }
+
+ i++;
+ }
+
+ if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+ err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+ &mflow->reg_id[i].id);
+ if (err)
+ goto err_create_flow;
+
+ if (is_bonded) {
+ flow_attr->port = 2;
+ err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+ &mflow->reg_id[j].mirror);
+ flow_attr->port = 1;
+ if (err)
+ goto err_create_flow;
+ j++;
+ }
+ /* function to create mirror rule */
+ i++;
+ }
+
+ return &mflow->ibflow;
+
+err_create_flow:
+ while (i) {
+ (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+ mflow->reg_id[i].id);
+ i--;
+ }
+
+ while (j) {
+ (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+ mflow->reg_id[j].mirror);
+ j--;
+ }
+err_free:
+ kfree(mflow);
+ return ERR_PTR(err);
+}
+
+static int mlx4_ib_destroy_flow(struct ib_flow *flow_id)
+{
+ int err, ret = 0;
+ int i = 0;
+ struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device);
+ struct mlx4_ib_flow *mflow = to_mflow(flow_id);
+
+ while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) {
+ err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id);
+ if (err)
+ ret = err;
+ if (mflow->reg_id[i].mirror) {
+ err = __mlx4_ib_destroy_flow(mdev->dev,
+ mflow->reg_id[i].mirror);
+ if (err)
+ ret = err;
+ }
+ i++;
+ }
+
+ kfree(mflow);
+ return ret;
+}
+
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_dev *dev = mdev->dev;
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ struct mlx4_ib_steering *ib_steering = NULL;
+ enum mlx4_protocol prot = MLX4_PROT_IB_IPV6;
+ struct mlx4_flow_reg_id reg_id;
+
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
+ if (!ib_steering)
+ return -ENOMEM;
+ }
+
+ err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,
+ !!(mqp->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+ prot, &reg_id.id);
+ if (err) {
+ pr_err("multicast attach op failed, err %d\n", err);
+ goto err_malloc;
+ }
+
+ reg_id.mirror = 0;
+ if (mlx4_is_bonded(dev)) {
+ err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw,
+ (mqp->port == 1) ? 2 : 1,
+ !!(mqp->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+ prot, &reg_id.mirror);
+ if (err)
+ goto err_add;
+ }
+
+ err = add_gid_entry(ibqp, gid);
+ if (err)
+ goto err_add;
+
+ if (ib_steering) {
+ memcpy(ib_steering->gid.raw, gid->raw, 16);
+ ib_steering->reg_id = reg_id;
+ mutex_lock(&mqp->mutex);
+ list_add(&ib_steering->list, &mqp->steering_rules);
+ mutex_unlock(&mqp->mutex);
+ }
+ return 0;
+
+err_add:
+ mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.id);
+ if (reg_id.mirror)
+ mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.mirror);
+err_malloc:
+ kfree(ib_steering);
+
+ return err;
+}
+
+static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
+{
+ struct mlx4_ib_gid_entry *ge;
+ struct mlx4_ib_gid_entry *tmp;
+ struct mlx4_ib_gid_entry *ret = NULL;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ if (!memcmp(raw, ge->gid.raw, 16)) {
+ ret = ge;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_dev *dev = mdev->dev;
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ struct net_device *ndev;
+ struct mlx4_ib_gid_entry *ge;
+ struct mlx4_flow_reg_id reg_id = {0, 0};
+ enum mlx4_protocol prot = MLX4_PROT_IB_IPV6;
+
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ struct mlx4_ib_steering *ib_steering;
+
+ mutex_lock(&mqp->mutex);
+ list_for_each_entry(ib_steering, &mqp->steering_rules, list) {
+ if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) {
+ list_del(&ib_steering->list);
+ break;
+ }
+ }
+ mutex_unlock(&mqp->mutex);
+ if (&ib_steering->list == &mqp->steering_rules) {
+ pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n");
+ return -EINVAL;
+ }
+ reg_id = ib_steering->reg_id;
+ kfree(ib_steering);
+ }
+
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.id);
+ if (err)
+ return err;
+
+ if (mlx4_is_bonded(dev)) {
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.mirror);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&mqp->mutex);
+ ge = find_gid_entry(mqp, gid->raw);
+ if (ge) {
+ spin_lock_bh(&mdev->iboe.lock);
+ ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
+ if (ndev)
+ dev_hold(ndev);
+ spin_unlock_bh(&mdev->iboe.lock);
+ if (ndev)
+ dev_put(ndev);
+ list_del(&ge->list);
+ kfree(ge);
+ } else
+ pr_warn("could not find mgid entry\n");
+
+ mutex_unlock(&mqp->mutex);
+
+ return 0;
+}
+
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ ib_init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+ if (mlx4_is_master(dev->dev))
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(dev->ib_dev.node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX);
+
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static ssize_t hca_type_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
+
+ return sysfs_emit(buf, "MT%d\n", dev->dev->persist->pdev->device);
+}
+static DEVICE_ATTR_RO(hca_type);
+
+static ssize_t hw_rev_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
+
+ return sysfs_emit(buf, "%x\n", dev->dev->rev_id);
+}
+static DEVICE_ATTR_RO(hw_rev);
+
+static ssize_t board_id_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
+
+ return sysfs_emit(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id);
+}
+static DEVICE_ATTR_RO(board_id);
+
+static struct attribute *mlx4_class_attributes[] = {
+ &dev_attr_hw_rev.attr,
+ &dev_attr_hca_type.attr,
+ &dev_attr_board_id.attr,
+ NULL
+};
+
+static const struct attribute_group mlx4_attr_group = {
+ .attrs = mlx4_class_attributes,
+};
+
+struct diag_counter {
+ const char *name;
+ u32 offset;
+};
+
+#define DIAG_COUNTER(_name, _offset) \
+ { .name = #_name, .offset = _offset }
+
+static const struct diag_counter diag_basic[] = {
+ DIAG_COUNTER(rq_num_lle, 0x00),
+ DIAG_COUNTER(sq_num_lle, 0x04),
+ DIAG_COUNTER(rq_num_lqpoe, 0x08),
+ DIAG_COUNTER(sq_num_lqpoe, 0x0C),
+ DIAG_COUNTER(rq_num_lpe, 0x18),
+ DIAG_COUNTER(sq_num_lpe, 0x1C),
+ DIAG_COUNTER(rq_num_wrfe, 0x20),
+ DIAG_COUNTER(sq_num_wrfe, 0x24),
+ DIAG_COUNTER(sq_num_mwbe, 0x2C),
+ DIAG_COUNTER(sq_num_bre, 0x34),
+ DIAG_COUNTER(sq_num_rire, 0x44),
+ DIAG_COUNTER(rq_num_rire, 0x48),
+ DIAG_COUNTER(sq_num_rae, 0x4C),
+ DIAG_COUNTER(rq_num_rae, 0x50),
+ DIAG_COUNTER(sq_num_roe, 0x54),
+ DIAG_COUNTER(sq_num_tree, 0x5C),
+ DIAG_COUNTER(sq_num_rree, 0x64),
+ DIAG_COUNTER(rq_num_rnr, 0x68),
+ DIAG_COUNTER(sq_num_rnr, 0x6C),
+ DIAG_COUNTER(rq_num_oos, 0x100),
+ DIAG_COUNTER(sq_num_oos, 0x104),
+};
+
+static const struct diag_counter diag_ext[] = {
+ DIAG_COUNTER(rq_num_dup, 0x130),
+ DIAG_COUNTER(sq_num_to, 0x134),
+};
+
+static const struct diag_counter diag_device_only[] = {
+ DIAG_COUNTER(num_cqovf, 0x1A0),
+ DIAG_COUNTER(rq_num_udsdprd, 0x118),
+};
+
+static struct rdma_hw_stats *
+mlx4_ib_alloc_hw_device_stats(struct ib_device *ibdev)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+
+ if (!diag[0].descs)
+ return NULL;
+
+ return rdma_alloc_hw_stats_struct(diag[0].descs, diag[0].num_counters,
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static struct rdma_hw_stats *
+mlx4_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+
+ if (!diag[1].descs)
+ return NULL;
+
+ return rdma_alloc_hw_stats_struct(diag[1].descs, diag[1].num_counters,
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx4_ib_get_hw_stats(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ u32 port, int index)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+ u32 hw_value[ARRAY_SIZE(diag_device_only) +
+ ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {};
+ int ret;
+ int i;
+
+ ret = mlx4_query_diag_counters(dev->dev,
+ MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS,
+ diag[!!port].offset, hw_value,
+ diag[!!port].num_counters, port);
+
+ if (ret)
+ return ret;
+
+ for (i = 0; i < diag[!!port].num_counters; i++)
+ stats->value[i] = hw_value[i];
+
+ return diag[!!port].num_counters;
+}
+
+static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev,
+ struct rdma_stat_desc **pdescs,
+ u32 **offset, u32 *num, bool port)
+{
+ u32 num_counters;
+
+ num_counters = ARRAY_SIZE(diag_basic);
+
+ if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT)
+ num_counters += ARRAY_SIZE(diag_ext);
+
+ if (!port)
+ num_counters += ARRAY_SIZE(diag_device_only);
+
+ *pdescs = kcalloc(num_counters, sizeof(struct rdma_stat_desc),
+ GFP_KERNEL);
+ if (!*pdescs)
+ return -ENOMEM;
+
+ *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL);
+ if (!*offset)
+ goto err;
+
+ *num = num_counters;
+
+ return 0;
+
+err:
+ kfree(*pdescs);
+ return -ENOMEM;
+}
+
+static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev,
+ struct rdma_stat_desc *descs,
+ u32 *offset, bool port)
+{
+ int i;
+ int j;
+
+ for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) {
+ descs[i].name = diag_basic[i].name;
+ offset[i] = diag_basic[i].offset;
+ }
+
+ if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) {
+ for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) {
+ descs[j].name = diag_ext[i].name;
+ offset[j] = diag_ext[i].offset;
+ }
+ }
+
+ if (!port) {
+ for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) {
+ descs[j].name = diag_device_only[i].name;
+ offset[j] = diag_device_only[i].offset;
+ }
+ }
+}
+
+static const struct ib_device_ops mlx4_ib_hw_stats_ops = {
+ .alloc_hw_device_stats = mlx4_ib_alloc_hw_device_stats,
+ .alloc_hw_port_stats = mlx4_ib_alloc_hw_port_stats,
+ .get_hw_stats = mlx4_ib_get_hw_stats,
+};
+
+static const struct ib_device_ops mlx4_ib_hw_stats_ops1 = {
+ .alloc_hw_device_stats = mlx4_ib_alloc_hw_device_stats,
+ .get_hw_stats = mlx4_ib_get_hw_stats,
+};
+
+static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
+{
+ struct mlx4_ib_diag_counters *diag = ibdev->diag_counters;
+ int i;
+ int ret;
+ bool per_port = !!(ibdev->dev->caps.flags2 &
+ MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT);
+
+ if (mlx4_is_slave(ibdev->dev))
+ return 0;
+
+ for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+ /*
+ * i == 1 means we are building port counters, set a different
+ * stats ops without port stats callback.
+ */
+ if (i && !per_port) {
+ ib_set_device_ops(&ibdev->ib_dev,
+ &mlx4_ib_hw_stats_ops1);
+
+ return 0;
+ }
+
+ ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].descs,
+ &diag[i].offset,
+ &diag[i].num_counters, i);
+ if (ret)
+ goto err_alloc;
+
+ mlx4_ib_fill_diag_counters(ibdev, diag[i].descs,
+ diag[i].offset, i);
+ }
+
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_hw_stats_ops);
+
+ return 0;
+
+err_alloc:
+ if (i) {
+ kfree(diag[i - 1].descs);
+ kfree(diag[i - 1].offset);
+ }
+
+ return ret;
+}
+
+static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev)
+{
+ int i;
+
+ for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+ kfree(ibdev->diag_counters[i].offset);
+ kfree(ibdev->diag_counters[i].descs);
+ }
+}
+
+#define MLX4_IB_INVALID_MAC ((u64)-1)
+static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev,
+ int port)
+{
+ u64 new_smac = 0;
+ u64 release_mac = MLX4_IB_INVALID_MAC;
+ struct mlx4_ib_qp *qp;
+
+ new_smac = ether_addr_to_u64(dev->dev_addr);
+ atomic64_set(&ibdev->iboe.mac[port - 1], new_smac);
+
+ /* no need for update QP1 and mac registration in non-SRIOV */
+ if (!mlx4_is_mfunc(ibdev->dev))
+ return;
+
+ mutex_lock(&ibdev->qp1_proxy_lock[port - 1]);
+ qp = ibdev->qp1_proxy[port - 1];
+ if (qp) {
+ int new_smac_index;
+ u64 old_smac;
+ struct mlx4_update_qp_params update_params;
+
+ mutex_lock(&qp->mutex);
+ old_smac = qp->pri.smac;
+ if (new_smac == old_smac)
+ goto unlock;
+
+ new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac);
+
+ if (new_smac_index < 0)
+ goto unlock;
+
+ update_params.smac_index = new_smac_index;
+ if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC,
+ &update_params)) {
+ release_mac = new_smac;
+ goto unlock;
+ }
+ /* if old port was zero, no mac was yet registered for this QP */
+ if (qp->pri.smac_port)
+ release_mac = old_smac;
+ qp->pri.smac = new_smac;
+ qp->pri.smac_port = port;
+ qp->pri.smac_index = new_smac_index;
+ }
+
+unlock:
+ if (release_mac != MLX4_IB_INVALID_MAC)
+ mlx4_unregister_mac(ibdev->dev, port, release_mac);
+ if (qp)
+ mutex_unlock(&qp->mutex);
+ mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
+}
+
+static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev,
+ unsigned long event)
+
+{
+ struct mlx4_ib_iboe *iboe;
+ int update_qps_port = -1;
+ int port;
+
+ ASSERT_RTNL();
+
+ iboe = &ibdev->iboe;
+
+ spin_lock_bh(&iboe->lock);
+ mlx4_foreach_ib_transport_port(port, ibdev->dev) {
+
+ iboe->netdevs[port - 1] =
+ mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
+
+ if (dev == iboe->netdevs[port - 1] &&
+ (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
+ event == NETDEV_UP || event == NETDEV_CHANGE))
+ update_qps_port = port;
+
+ if (dev == iboe->netdevs[port - 1] &&
+ (event == NETDEV_UP || event == NETDEV_DOWN)) {
+ enum ib_port_state port_state;
+ struct ib_event ibev = { };
+
+ if (ib_get_cached_port_state(&ibdev->ib_dev, port,
+ &port_state))
+ continue;
+
+ if (event == NETDEV_UP &&
+ (port_state != IB_PORT_ACTIVE ||
+ iboe->last_port_state[port - 1] != IB_PORT_DOWN))
+ continue;
+ if (event == NETDEV_DOWN &&
+ (port_state != IB_PORT_DOWN ||
+ iboe->last_port_state[port - 1] != IB_PORT_ACTIVE))
+ continue;
+ iboe->last_port_state[port - 1] = port_state;
+
+ ibev.device = &ibdev->ib_dev;
+ ibev.element.port_num = port;
+ ibev.event = event == NETDEV_UP ? IB_EVENT_PORT_ACTIVE :
+ IB_EVENT_PORT_ERR;
+ ib_dispatch_event(&ibev);
+ }
+
+ }
+ spin_unlock_bh(&iboe->lock);
+
+ if (update_qps_port > 0)
+ mlx4_ib_update_qps(ibdev, dev, update_qps_port);
+}
+
+static int mlx4_ib_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct mlx4_ib_dev *ibdev;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
+ mlx4_ib_scan_netdevs(ibdev, dev, event);
+
+ return NOTIFY_DONE;
+}
+
+static void init_pkeys(struct mlx4_ib_dev *ibdev)
+{
+ int port;
+ int slave;
+ int i;
+
+ if (mlx4_is_master(ibdev->dev)) {
+ for (slave = 0; slave <= ibdev->dev->persist->num_vfs;
+ ++slave) {
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+ for (i = 0;
+ i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+ ++i) {
+ ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] =
+ /* master has the identity virt2phys pkey mapping */
+ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i :
+ ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+ mlx4_sync_pkey_table(ibdev->dev, slave, port, i,
+ ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]);
+ }
+ }
+ }
+ /* initialize pkey cache */
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+ for (i = 0;
+ i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+ ++i)
+ ibdev->pkeys.phys_pkey_cache[port-1][i] =
+ (i) ? 0 : 0xFFFF;
+ }
+ }
+}
+
+static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+ int i, j, eq = 0, total_eqs = 0;
+
+ ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors,
+ sizeof(ibdev->eq_table[0]), GFP_KERNEL);
+ if (!ibdev->eq_table)
+ return;
+
+ for (i = 1; i <= dev->caps.num_ports; i++) {
+ for (j = 0; j < mlx4_get_eqs_per_port(dev, i);
+ j++, total_eqs++) {
+ if (i > 1 && mlx4_is_eq_shared(dev, total_eqs))
+ continue;
+ ibdev->eq_table[eq] = total_eqs;
+ if (!mlx4_assign_eq(dev, i,
+ &ibdev->eq_table[eq]))
+ eq++;
+ else
+ ibdev->eq_table[eq] = -1;
+ }
+ }
+
+ for (i = eq; i < dev->caps.num_comp_vectors;
+ ibdev->eq_table[i++] = -1)
+ ;
+
+ /* Advertise the new number of EQs to clients */
+ ibdev->ib_dev.num_comp_vectors = eq;
+}
+
+static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+ int i;
+ int total_eqs = ibdev->ib_dev.num_comp_vectors;
+
+ /* no eqs were allocated */
+ if (!ibdev->eq_table)
+ return;
+
+ /* Reset the advertised EQ number */
+ ibdev->ib_dev.num_comp_vectors = 0;
+
+ for (i = 0; i < total_eqs; i++)
+ mlx4_release_eq(dev, ibdev->eq_table[i]);
+
+ kfree(ibdev->eq_table);
+ ibdev->eq_table = NULL;
+}
+
+static int mlx4_port_immutable(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct ib_port_attr attr;
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+ int err;
+
+ if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) {
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+ immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+ } else {
+ if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+ if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE |
+ RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+ immutable->core_cap_flags |= RDMA_CORE_PORT_RAW_PACKET;
+ if (immutable->core_cap_flags & (RDMA_CORE_PORT_IBA_ROCE |
+ RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP))
+ immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+ }
+
+ err = ib_query_port(ibdev, port_num, &attr);
+ if (err)
+ return err;
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+
+ return 0;
+}
+
+static void get_fw_ver_str(struct ib_device *device, char *str)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev);
+ snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d",
+ (int) (dev->dev->caps.fw_ver >> 32),
+ (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+ (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
+static const struct ib_device_ops mlx4_ib_dev_ops = {
+ .owner = THIS_MODULE,
+ .driver_id = RDMA_DRIVER_MLX4,
+ .uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION,
+
+ .add_gid = mlx4_ib_add_gid,
+ .alloc_mr = mlx4_ib_alloc_mr,
+ .alloc_pd = mlx4_ib_alloc_pd,
+ .alloc_ucontext = mlx4_ib_alloc_ucontext,
+ .attach_mcast = mlx4_ib_mcg_attach,
+ .create_ah = mlx4_ib_create_ah,
+ .create_cq = mlx4_ib_create_cq,
+ .create_qp = mlx4_ib_create_qp,
+ .create_srq = mlx4_ib_create_srq,
+ .dealloc_pd = mlx4_ib_dealloc_pd,
+ .dealloc_ucontext = mlx4_ib_dealloc_ucontext,
+ .del_gid = mlx4_ib_del_gid,
+ .dereg_mr = mlx4_ib_dereg_mr,
+ .destroy_ah = mlx4_ib_destroy_ah,
+ .destroy_cq = mlx4_ib_destroy_cq,
+ .destroy_qp = mlx4_ib_destroy_qp,
+ .destroy_srq = mlx4_ib_destroy_srq,
+ .detach_mcast = mlx4_ib_mcg_detach,
+ .device_group = &mlx4_attr_group,
+ .disassociate_ucontext = mlx4_ib_disassociate_ucontext,
+ .drain_rq = mlx4_ib_drain_rq,
+ .drain_sq = mlx4_ib_drain_sq,
+ .get_dev_fw_str = get_fw_ver_str,
+ .get_dma_mr = mlx4_ib_get_dma_mr,
+ .get_link_layer = mlx4_ib_port_link_layer,
+ .get_netdev = mlx4_ib_get_netdev,
+ .get_port_immutable = mlx4_port_immutable,
+ .map_mr_sg = mlx4_ib_map_mr_sg,
+ .mmap = mlx4_ib_mmap,
+ .modify_cq = mlx4_ib_modify_cq,
+ .modify_device = mlx4_ib_modify_device,
+ .modify_port = mlx4_ib_modify_port,
+ .modify_qp = mlx4_ib_modify_qp,
+ .modify_srq = mlx4_ib_modify_srq,
+ .poll_cq = mlx4_ib_poll_cq,
+ .post_recv = mlx4_ib_post_recv,
+ .post_send = mlx4_ib_post_send,
+ .post_srq_recv = mlx4_ib_post_srq_recv,
+ .process_mad = mlx4_ib_process_mad,
+ .query_ah = mlx4_ib_query_ah,
+ .query_device = mlx4_ib_query_device,
+ .query_gid = mlx4_ib_query_gid,
+ .query_pkey = mlx4_ib_query_pkey,
+ .query_port = mlx4_ib_query_port,
+ .query_qp = mlx4_ib_query_qp,
+ .query_srq = mlx4_ib_query_srq,
+ .reg_user_mr = mlx4_ib_reg_user_mr,
+ .req_notify_cq = mlx4_ib_arm_cq,
+ .rereg_user_mr = mlx4_ib_rereg_user_mr,
+ .resize_cq = mlx4_ib_resize_cq,
+
+ INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah),
+ INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq),
+ INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_qp, mlx4_ib_qp, ibqp),
+ INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops mlx4_ib_dev_wq_ops = {
+ .create_rwq_ind_table = mlx4_ib_create_rwq_ind_table,
+ .create_wq = mlx4_ib_create_wq,
+ .destroy_rwq_ind_table = mlx4_ib_destroy_rwq_ind_table,
+ .destroy_wq = mlx4_ib_destroy_wq,
+ .modify_wq = mlx4_ib_modify_wq,
+
+ INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx4_ib_rwq_ind_table,
+ ib_rwq_ind_tbl),
+};
+
+static const struct ib_device_ops mlx4_ib_dev_mw_ops = {
+ .alloc_mw = mlx4_ib_alloc_mw,
+ .dealloc_mw = mlx4_ib_dealloc_mw,
+
+ INIT_RDMA_OBJ_SIZE(ib_mw, mlx4_ib_mw, ibmw),
+};
+
+static const struct ib_device_ops mlx4_ib_dev_xrc_ops = {
+ .alloc_xrcd = mlx4_ib_alloc_xrcd,
+ .dealloc_xrcd = mlx4_ib_dealloc_xrcd,
+
+ INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx4_ib_xrcd, ibxrcd),
+};
+
+static const struct ib_device_ops mlx4_ib_dev_fs_ops = {
+ .create_flow = mlx4_ib_create_flow,
+ .destroy_flow = mlx4_ib_destroy_flow,
+};
+
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+ struct mlx4_ib_dev *ibdev;
+ int num_ports = 0;
+ int i, j;
+ int err;
+ struct mlx4_ib_iboe *iboe;
+ int ib_num_ports = 0;
+ int num_req_counters;
+ int allocated;
+ u32 counter_index;
+ struct counter_index *new_counter_index = NULL;
+
+ pr_info_once("%s", mlx4_ib_version);
+
+ num_ports = 0;
+ mlx4_foreach_ib_transport_port(i, dev)
+ num_ports++;
+
+ /* No point in registering a device with no ports... */
+ if (num_ports == 0)
+ return NULL;
+
+ ibdev = ib_alloc_device(mlx4_ib_dev, ib_dev);
+ if (!ibdev) {
+ dev_err(&dev->persist->pdev->dev,
+ "Device struct alloc failed\n");
+ return NULL;
+ }
+
+ iboe = &ibdev->iboe;
+
+ if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+ goto err_dealloc;
+
+ if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+ goto err_pd;
+
+ ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+ if (!ibdev->uar_map)
+ goto err_uar;
+ MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
+
+ ibdev->dev = dev;
+ ibdev->bond_next_port = 0;
+
+ ibdev->ib_dev.node_type = RDMA_NODE_IB_CA;
+ ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey;
+ ibdev->num_ports = num_ports;
+ ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ?
+ 1 : ibdev->num_ports;
+ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+ ibdev->ib_dev.dev.parent = &dev->persist->pdev->dev;
+
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_ops);
+
+ if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) &&
+ ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) ==
+ IB_LINK_LAYER_ETHERNET) ||
+ (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) ==
+ IB_LINK_LAYER_ETHERNET)))
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_wq_ops);
+
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
+ dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN)
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_mw_ops);
+
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_xrc_ops);
+ }
+
+ if (check_flow_steering_support(dev)) {
+ ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED;
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fs_ops);
+ }
+
+ if (!dev->caps.userspace_caps)
+ ibdev->ib_dev.ops.uverbs_abi_ver =
+ MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
+
+ mlx4_ib_alloc_eqs(dev, ibdev);
+
+ spin_lock_init(&iboe->lock);
+
+ if (init_node_data(ibdev))
+ goto err_map;
+ mlx4_init_sl2vl_tbl(ibdev);
+
+ for (i = 0; i < ibdev->num_ports; ++i) {
+ mutex_init(&ibdev->counters_table[i].mutex);
+ INIT_LIST_HEAD(&ibdev->counters_table[i].counters_list);
+ iboe->last_port_state[i] = IB_PORT_DOWN;
+ }
+
+ num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports;
+ for (i = 0; i < num_req_counters; ++i) {
+ mutex_init(&ibdev->qp1_proxy_lock[i]);
+ allocated = 0;
+ if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
+ IB_LINK_LAYER_ETHERNET) {
+ err = mlx4_counter_alloc(ibdev->dev, &counter_index,
+ MLX4_RES_USAGE_DRIVER);
+ /* if failed to allocate a new counter, use default */
+ if (err)
+ counter_index =
+ mlx4_get_default_counter_index(dev,
+ i + 1);
+ else
+ allocated = 1;
+ } else { /* IB_LINK_LAYER_INFINIBAND use the default counter */
+ counter_index = mlx4_get_default_counter_index(dev,
+ i + 1);
+ }
+ new_counter_index = kmalloc(sizeof(*new_counter_index),
+ GFP_KERNEL);
+ if (!new_counter_index) {
+ if (allocated)
+ mlx4_counter_free(ibdev->dev, counter_index);
+ goto err_counter;
+ }
+ new_counter_index->index = counter_index;
+ new_counter_index->allocated = allocated;
+ list_add_tail(&new_counter_index->list,
+ &ibdev->counters_table[i].counters_list);
+ ibdev->counters_table[i].default_counter = counter_index;
+ pr_info("counter index %d for port %d allocated %d\n",
+ counter_index, i + 1, allocated);
+ }
+ if (mlx4_is_bonded(dev))
+ for (i = 1; i < ibdev->num_ports ; ++i) {
+ new_counter_index =
+ kmalloc(sizeof(struct counter_index),
+ GFP_KERNEL);
+ if (!new_counter_index)
+ goto err_counter;
+ new_counter_index->index = counter_index;
+ new_counter_index->allocated = 0;
+ list_add_tail(&new_counter_index->list,
+ &ibdev->counters_table[i].counters_list);
+ ibdev->counters_table[i].default_counter =
+ counter_index;
+ }
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_num_ports++;
+
+ spin_lock_init(&ibdev->sm_lock);
+ mutex_init(&ibdev->cap_mask_mutex);
+ INIT_LIST_HEAD(&ibdev->qp_list);
+ spin_lock_init(&ibdev->reset_flow_resource_lock);
+
+ if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED &&
+ ib_num_ports) {
+ ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
+ err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
+ MLX4_IB_UC_STEER_QPN_ALIGN,
+ &ibdev->steer_qpn_base, 0,
+ MLX4_RES_USAGE_DRIVER);
+ if (err)
+ goto err_counter;
+
+ ibdev->ib_uc_qpns_bitmap = bitmap_alloc(ibdev->steer_qpn_count,
+ GFP_KERNEL);
+ if (!ibdev->ib_uc_qpns_bitmap)
+ goto err_steer_qp_release;
+
+ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB) {
+ bitmap_zero(ibdev->ib_uc_qpns_bitmap,
+ ibdev->steer_qpn_count);
+ err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(
+ dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_base +
+ ibdev->steer_qpn_count - 1);
+ if (err)
+ goto err_steer_free_bitmap;
+ } else {
+ bitmap_fill(ibdev->ib_uc_qpns_bitmap,
+ ibdev->steer_qpn_count);
+ }
+ }
+
+ for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
+ atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
+
+ if (mlx4_ib_alloc_diag_counters(ibdev))
+ goto err_steer_free_bitmap;
+
+ if (ib_register_device(&ibdev->ib_dev, "mlx4_%d",
+ &dev->persist->pdev->dev))
+ goto err_diag_counters;
+
+ if (mlx4_ib_mad_init(ibdev))
+ goto err_reg;
+
+ if (mlx4_ib_init_sriov(ibdev))
+ goto err_mad;
+
+ if (!iboe->nb.notifier_call) {
+ iboe->nb.notifier_call = mlx4_ib_netdev_event;
+ err = register_netdevice_notifier(&iboe->nb);
+ if (err) {
+ iboe->nb.notifier_call = NULL;
+ goto err_notif;
+ }
+ }
+ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+ err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT);
+ if (err)
+ goto err_notif;
+ }
+
+ ibdev->ib_active = true;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i),
+ &ibdev->ib_dev);
+
+ if (mlx4_is_mfunc(ibdev->dev))
+ init_pkeys(ibdev);
+
+ /* create paravirt contexts for any VFs which are active */
+ if (mlx4_is_master(ibdev->dev)) {
+ for (j = 0; j < MLX4_MFUNC_MAX; j++) {
+ if (j == mlx4_master_func_num(ibdev->dev))
+ continue;
+ if (mlx4_is_slave_active(ibdev->dev, j))
+ do_slave_init(ibdev, j, 1);
+ }
+ }
+ return ibdev;
+
+err_notif:
+ if (ibdev->iboe.nb.notifier_call) {
+ if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb.notifier_call = NULL;
+ }
+ flush_workqueue(wq);
+
+ mlx4_ib_close_sriov(ibdev);
+
+err_mad:
+ mlx4_ib_mad_cleanup(ibdev);
+
+err_reg:
+ ib_unregister_device(&ibdev->ib_dev);
+
+err_diag_counters:
+ mlx4_ib_diag_cleanup(ibdev);
+
+err_steer_free_bitmap:
+ bitmap_free(ibdev->ib_uc_qpns_bitmap);
+
+err_steer_qp_release:
+ mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_count);
+err_counter:
+ for (i = 0; i < ibdev->num_ports; ++i)
+ mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]);
+
+err_map:
+ mlx4_ib_free_eqs(dev, ibdev);
+ iounmap(ibdev->uar_map);
+
+err_uar:
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+ ib_dealloc_device(&ibdev->ib_dev);
+
+ return NULL;
+}
+
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn)
+{
+ int offset;
+
+ WARN_ON(!dev->ib_uc_qpns_bitmap);
+
+ offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap,
+ dev->steer_qpn_count,
+ get_count_order(count));
+ if (offset < 0)
+ return offset;
+
+ *qpn = dev->steer_qpn_base + offset;
+ return 0;
+}
+
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count)
+{
+ if (!qpn ||
+ dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED)
+ return;
+
+ if (WARN(qpn < dev->steer_qpn_base, "qpn = %u, steer_qpn_base = %u\n",
+ qpn, dev->steer_qpn_base))
+ /* not supposed to be here */
+ return;
+
+ bitmap_release_region(dev->ib_uc_qpns_bitmap,
+ qpn - dev->steer_qpn_base,
+ get_count_order(count));
+}
+
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ int is_attach)
+{
+ int err;
+ size_t flow_size;
+ struct ib_flow_attr *flow = NULL;
+ struct ib_flow_spec_ib *ib_spec;
+
+ if (is_attach) {
+ flow_size = sizeof(struct ib_flow_attr) +
+ sizeof(struct ib_flow_spec_ib);
+ flow = kzalloc(flow_size, GFP_KERNEL);
+ if (!flow)
+ return -ENOMEM;
+ flow->port = mqp->port;
+ flow->num_of_specs = 1;
+ flow->size = flow_size;
+ ib_spec = (struct ib_flow_spec_ib *)(flow + 1);
+ ib_spec->type = IB_FLOW_SPEC_IB;
+ ib_spec->size = sizeof(struct ib_flow_spec_ib);
+ /* Add an empty rule for IB L2 */
+ memset(&ib_spec->mask, 0, sizeof(ib_spec->mask));
+
+ err = __mlx4_ib_create_flow(&mqp->ibqp, flow, MLX4_DOMAIN_NIC,
+ MLX4_FS_REGULAR, &mqp->reg_id);
+ } else {
+ err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id);
+ }
+ kfree(flow);
+ return err;
+}
+
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+ struct mlx4_ib_dev *ibdev = ibdev_ptr;
+ int p;
+ int i;
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ devlink_port_type_clear(mlx4_get_devlink_port(dev, i));
+ ibdev->ib_active = false;
+ flush_workqueue(wq);
+
+ if (ibdev->iboe.nb.notifier_call) {
+ if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb.notifier_call = NULL;
+ }
+
+ mlx4_ib_close_sriov(ibdev);
+ mlx4_ib_mad_cleanup(ibdev);
+ ib_unregister_device(&ibdev->ib_dev);
+ mlx4_ib_diag_cleanup(ibdev);
+
+ mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_count);
+ bitmap_free(ibdev->ib_uc_qpns_bitmap);
+
+ iounmap(ibdev->uar_map);
+ for (p = 0; p < ibdev->num_ports; ++p)
+ mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[p]);
+
+ mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
+ mlx4_CLOSE_PORT(dev, p);
+
+ mlx4_ib_free_eqs(dev, ibdev);
+
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+ ib_dealloc_device(&ibdev->ib_dev);
+}
+
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)
+{
+ struct mlx4_ib_demux_work **dm = NULL;
+ struct mlx4_dev *dev = ibdev->dev;
+ int i;
+ unsigned long flags;
+ struct mlx4_active_ports actv_ports;
+ unsigned int ports;
+ unsigned int first_port;
+
+ if (!mlx4_is_master(dev))
+ return;
+
+ actv_ports = mlx4_get_active_ports(dev, slave);
+ ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+ first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+
+ dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC);
+ if (!dm)
+ return;
+
+ for (i = 0; i < ports; i++) {
+ dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC);
+ if (!dm[i]) {
+ while (--i >= 0)
+ kfree(dm[i]);
+ goto out;
+ }
+ INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work);
+ dm[i]->port = first_port + i + 1;
+ dm[i]->slave = slave;
+ dm[i]->do_init = do_init;
+ dm[i]->dev = ibdev;
+ }
+ /* initialize or tear down tunnel QPs for the slave */
+ spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags);
+ if (!ibdev->sriov.is_going_down) {
+ for (i = 0; i < ports; i++)
+ queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work);
+ spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags);
+ } else {
+ spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags);
+ for (i = 0; i < ports; i++)
+ kfree(dm[i]);
+ }
+out:
+ kfree(dm);
+ return;
+}
+
+static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev)
+{
+ struct mlx4_ib_qp *mqp;
+ unsigned long flags_qp;
+ unsigned long flags_cq;
+ struct mlx4_ib_cq *send_mcq, *recv_mcq;
+ struct list_head cq_notify_list;
+ struct mlx4_cq *mcq;
+ unsigned long flags;
+
+ pr_warn("mlx4_ib_handle_catas_error was started\n");
+ INIT_LIST_HEAD(&cq_notify_list);
+
+ /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/
+ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+
+ list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+ spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+ if (mqp->sq.tail != mqp->sq.head) {
+ send_mcq = to_mcq(mqp->ibqp.send_cq);
+ spin_lock_irqsave(&send_mcq->lock, flags_cq);
+ if (send_mcq->mcq.comp &&
+ mqp->ibqp.send_cq->comp_handler) {
+ if (!send_mcq->mcq.reset_notify_added) {
+ send_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&send_mcq->mcq.reset_notify,
+ &cq_notify_list);
+ }
+ }
+ spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+ }
+ spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+ /* Now, handle the QP's receive queue */
+ spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+ /* no handling is needed for SRQ */
+ if (!mqp->ibqp.srq) {
+ if (mqp->rq.tail != mqp->rq.head) {
+ recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+ spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+ if (recv_mcq->mcq.comp &&
+ mqp->ibqp.recv_cq->comp_handler) {
+ if (!recv_mcq->mcq.reset_notify_added) {
+ recv_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&recv_mcq->mcq.reset_notify,
+ &cq_notify_list);
+ }
+ }
+ spin_unlock_irqrestore(&recv_mcq->lock,
+ flags_cq);
+ }
+ }
+ spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+ }
+
+ list_for_each_entry(mcq, &cq_notify_list, reset_notify) {
+ mcq->comp(mcq);
+ }
+ spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+ pr_warn("mlx4_ib_handle_catas_error ended\n");
+}
+
+static void handle_bonded_port_state_event(struct work_struct *work)
+{
+ struct ib_event_work *ew =
+ container_of(work, struct ib_event_work, work);
+ struct mlx4_ib_dev *ibdev = ew->ib_dev;
+ enum ib_port_state bonded_port_state = IB_PORT_NOP;
+ int i;
+ struct ib_event ibev;
+
+ kfree(ew);
+ spin_lock_bh(&ibdev->iboe.lock);
+ for (i = 0; i < MLX4_MAX_PORTS; ++i) {
+ struct net_device *curr_netdev = ibdev->iboe.netdevs[i];
+ enum ib_port_state curr_port_state;
+
+ if (!curr_netdev)
+ continue;
+
+ curr_port_state =
+ (netif_running(curr_netdev) &&
+ netif_carrier_ok(curr_netdev)) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+
+ bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ?
+ curr_port_state : IB_PORT_ACTIVE;
+ }
+ spin_unlock_bh(&ibdev->iboe.lock);
+
+ ibev.device = &ibdev->ib_dev;
+ ibev.element.port_num = 1;
+ ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ?
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+ ib_dispatch_event(&ibev);
+}
+
+void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port)
+{
+ u64 sl2vl;
+ int err;
+
+ err = mlx4_ib_query_sl2vl(&mdev->ib_dev, port, &sl2vl);
+ if (err) {
+ pr_err("Unable to get current sl to vl mapping for port %d. Using all zeroes (%d)\n",
+ port, err);
+ sl2vl = 0;
+ }
+ atomic64_set(&mdev->sl2vl[port - 1], sl2vl);
+}
+
+static void ib_sl2vl_update_work(struct work_struct *work)
+{
+ struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
+ struct mlx4_ib_dev *mdev = ew->ib_dev;
+ int port = ew->port;
+
+ mlx4_ib_sl2vl_update(mdev, port);
+
+ kfree(ew);
+}
+
+void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev,
+ int port)
+{
+ struct ib_event_work *ew;
+
+ ew = kmalloc(sizeof(*ew), GFP_ATOMIC);
+ if (ew) {
+ INIT_WORK(&ew->work, ib_sl2vl_update_work);
+ ew->port = port;
+ ew->ib_dev = ibdev;
+ queue_work(wq, &ew->work);
+ }
+}
+
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+ enum mlx4_dev_event event, unsigned long param)
+{
+ struct ib_event ibev;
+ struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+ struct mlx4_eqe *eqe = NULL;
+ struct ib_event_work *ew;
+ int p = 0;
+
+ if (mlx4_is_bonded(dev) &&
+ ((event == MLX4_DEV_EVENT_PORT_UP) ||
+ (event == MLX4_DEV_EVENT_PORT_DOWN))) {
+ ew = kmalloc(sizeof(*ew), GFP_ATOMIC);
+ if (!ew)
+ return;
+ INIT_WORK(&ew->work, handle_bonded_port_state_event);
+ ew->ib_dev = ibdev;
+ queue_work(wq, &ew->work);
+ return;
+ }
+
+ if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE)
+ eqe = (struct mlx4_eqe *)param;
+ else
+ p = (int) param;
+
+ switch (event) {
+ case MLX4_DEV_EVENT_PORT_UP:
+ if (p > ibdev->num_ports)
+ return;
+ if (!mlx4_is_slave(dev) &&
+ rdma_port_get_link_layer(&ibdev->ib_dev, p) ==
+ IB_LINK_LAYER_INFINIBAND) {
+ if (mlx4_is_master(dev))
+ mlx4_ib_invalidate_all_guid_record(ibdev, p);
+ if (ibdev->dev->flags & MLX4_FLAG_SECURE_HOST &&
+ !(ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT))
+ mlx4_sched_ib_sl2vl_update_work(ibdev, p);
+ }
+ ibev.event = IB_EVENT_PORT_ACTIVE;
+ break;
+
+ case MLX4_DEV_EVENT_PORT_DOWN:
+ if (p > ibdev->num_ports)
+ return;
+ ibev.event = IB_EVENT_PORT_ERR;
+ break;
+
+ case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+ ibdev->ib_active = false;
+ ibev.event = IB_EVENT_DEVICE_FATAL;
+ mlx4_ib_handle_catas_error(ibdev);
+ break;
+
+ case MLX4_DEV_EVENT_PORT_MGMT_CHANGE:
+ ew = kmalloc(sizeof *ew, GFP_ATOMIC);
+ if (!ew)
+ return;
+
+ INIT_WORK(&ew->work, handle_port_mgmt_change_event);
+ memcpy(&ew->ib_eqe, eqe, sizeof *eqe);
+ ew->ib_dev = ibdev;
+ /* need to queue only for port owner, which uses GEN_EQE */
+ if (mlx4_is_master(dev))
+ queue_work(wq, &ew->work);
+ else
+ handle_port_mgmt_change_event(&ew->work);
+ return;
+
+ case MLX4_DEV_EVENT_SLAVE_INIT:
+ /* here, p is the slave id */
+ do_slave_init(ibdev, p, 1);
+ if (mlx4_is_master(dev)) {
+ int i;
+
+ for (i = 1; i <= ibdev->num_ports; i++) {
+ if (rdma_port_get_link_layer(&ibdev->ib_dev, i)
+ == IB_LINK_LAYER_INFINIBAND)
+ mlx4_ib_slave_alias_guid_event(ibdev,
+ p, i,
+ 1);
+ }
+ }
+ return;
+
+ case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+ if (mlx4_is_master(dev)) {
+ int i;
+
+ for (i = 1; i <= ibdev->num_ports; i++) {
+ if (rdma_port_get_link_layer(&ibdev->ib_dev, i)
+ == IB_LINK_LAYER_INFINIBAND)
+ mlx4_ib_slave_alias_guid_event(ibdev,
+ p, i,
+ 0);
+ }
+ }
+ /* here, p is the slave id */
+ do_slave_init(ibdev, p, 0);
+ return;
+
+ default:
+ return;
+ }
+
+ ibev.device = ibdev_ptr;
+ ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p;
+
+ ib_dispatch_event(&ibev);
+}
+
+static struct mlx4_interface mlx4_ib_interface = {
+ .add = mlx4_ib_add,
+ .remove = mlx4_ib_remove,
+ .event = mlx4_ib_event,
+ .protocol = MLX4_PROT_IB_IPV6,
+ .flags = MLX4_INTFF_BONDING
+};
+
+static int __init mlx4_ib_init(void)
+{
+ int err;
+
+ wq = alloc_ordered_workqueue("mlx4_ib", WQ_MEM_RECLAIM);
+ if (!wq)
+ return -ENOMEM;
+
+ err = mlx4_ib_cm_init();
+ if (err)
+ goto clean_wq;
+
+ err = mlx4_ib_mcg_init();
+ if (err)
+ goto clean_cm;
+
+ err = mlx4_register_interface(&mlx4_ib_interface);
+ if (err)
+ goto clean_mcg;
+
+ return 0;
+
+clean_mcg:
+ mlx4_ib_mcg_destroy();
+
+clean_cm:
+ mlx4_ib_cm_destroy();
+
+clean_wq:
+ destroy_workqueue(wq);
+ return err;
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+ mlx4_unregister_interface(&mlx4_ib_interface);
+ mlx4_ib_mcg_destroy();
+ mlx4_ib_cm_destroy();
+ destroy_workqueue(wq);
+}
+
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
new file mode 100644
index 000000000..33f525b74
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -0,0 +1,1267 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/delay.h>
+
+#include "mlx4_ib.h"
+
+#define MAX_VFS 80
+#define MAX_PEND_REQS_PER_FUNC 4
+#define MAD_TIMEOUT_MS 2000
+
+#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg)
+#define mcg_error(fmt, arg...) pr_err(fmt, ##arg)
+#define mcg_warn_group(group, format, arg...) \
+ pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+ (group)->name, group->demux->port, ## arg)
+
+#define mcg_debug_group(group, format, arg...) \
+ pr_debug("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+ (group)->name, (group)->demux->port, ## arg)
+
+#define mcg_error_group(group, format, arg...) \
+ pr_err(" %16s: " format, (group)->name, ## arg)
+
+
+static union ib_gid mgid0;
+
+static struct workqueue_struct *clean_wq;
+
+enum mcast_state {
+ MCAST_NOT_MEMBER = 0,
+ MCAST_MEMBER,
+};
+
+enum mcast_group_state {
+ MCAST_IDLE,
+ MCAST_JOIN_SENT,
+ MCAST_LEAVE_SENT,
+ MCAST_RESP_READY
+};
+
+struct mcast_member {
+ enum mcast_state state;
+ uint8_t join_state;
+ int num_pend_reqs;
+ struct list_head pending;
+};
+
+struct ib_sa_mcmember_data {
+ union ib_gid mgid;
+ union ib_gid port_gid;
+ __be32 qkey;
+ __be16 mlid;
+ u8 mtusel_mtu;
+ u8 tclass;
+ __be16 pkey;
+ u8 ratesel_rate;
+ u8 lifetmsel_lifetm;
+ __be32 sl_flowlabel_hoplimit;
+ u8 scope_join_state;
+ u8 proxy_join;
+ u8 reserved[2];
+} __packed __aligned(4);
+
+struct mcast_group {
+ struct ib_sa_mcmember_data rec;
+ struct rb_node node;
+ struct list_head mgid0_list;
+ struct mlx4_ib_demux_ctx *demux;
+ struct mcast_member func[MAX_VFS];
+ struct mutex lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ int members[3];
+ enum mcast_group_state state;
+ enum mcast_group_state prev_state;
+ struct ib_sa_mad response_sa_mad;
+ __be64 last_req_tid;
+
+ char name[33]; /* MGID string */
+ struct device_attribute dentry;
+
+ /* refcount is the reference count for the following:
+ 1. Each queued request
+ 2. Each invocation of the worker thread
+ 3. Membership of the port at the SA
+ */
+ atomic_t refcount;
+
+ /* delayed work to clean pending SM request */
+ struct delayed_work timeout_work;
+ struct list_head cleanup_list;
+};
+
+struct mcast_req {
+ int func;
+ struct ib_sa_mad sa_mad;
+ struct list_head group_list;
+ struct list_head func_list;
+ struct mcast_group *group;
+ int clean;
+};
+
+
+#define safe_atomic_dec(ref) \
+ do {\
+ if (atomic_dec_and_test(ref)) \
+ mcg_warn_group(group, "did not expect to reach zero\n"); \
+ } while (0)
+
+static const char *get_state_string(enum mcast_group_state state)
+{
+ switch (state) {
+ case MCAST_IDLE:
+ return "MCAST_IDLE";
+ case MCAST_JOIN_SENT:
+ return "MCAST_JOIN_SENT";
+ case MCAST_LEAVE_SENT:
+ return "MCAST_LEAVE_SENT";
+ case MCAST_RESP_READY:
+ return "MCAST_RESP_READY";
+ }
+ return "Invalid State";
+}
+
+static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx,
+ union ib_gid *mgid)
+{
+ struct rb_node *node = ctx->mcg_table.rb_node;
+ struct mcast_group *group;
+ int ret;
+
+ while (node) {
+ group = rb_entry(node, struct mcast_group, node);
+ ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+ if (!ret)
+ return group;
+
+ if (ret < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx,
+ struct mcast_group *group)
+{
+ struct rb_node **link = &ctx->mcg_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct mcast_group *cur_group;
+ int ret;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct mcast_group, node);
+
+ ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+ sizeof group->rec.mgid);
+ if (ret < 0)
+ link = &(*link)->rb_left;
+ else if (ret > 0)
+ link = &(*link)->rb_right;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &ctx->mcg_table);
+ return NULL;
+}
+
+static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = ctx->dev;
+ struct rdma_ah_attr ah_attr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ if (!dev->sm_ah[ctx->port - 1]) {
+ /* port is not yet Active, sm_ah not ready */
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+ return -EAGAIN;
+ }
+ mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+ return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
+ ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
+ &ah_attr, NULL, 0xffff, mad);
+}
+
+static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
+ struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = ctx->dev;
+ struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1];
+ struct ib_wc wc;
+ struct rdma_ah_attr ah_attr;
+
+ /* Our agent might not yet be registered when mads start to arrive */
+ if (!agent)
+ return -EAGAIN;
+
+ rdma_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+
+ if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index))
+ return -EINVAL;
+ wc.sl = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = ctx->port;
+ wc.slid = rdma_ah_get_dlid(&ah_attr); /* opensm lid */
+ wc.src_qp = 1;
+ return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad);
+}
+
+static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data;
+ int ret;
+
+ /* we rely on a mad request as arrived from a VF */
+ memcpy(&mad, sa_mad, sizeof mad);
+
+ /* fix port GID to be the real one (slave 0) */
+ sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0];
+
+ /* assign our own TID */
+ mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+ group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+
+ ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+ /* set timeout handler */
+ if (!ret) {
+ /* calls mlx4_ib_mcg_timeout_handler */
+ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+ msecs_to_jiffies(MAD_TIMEOUT_MS));
+ }
+
+ return ret;
+}
+
+static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+ int ret;
+
+ memset(&mad, 0, sizeof mad);
+ mad.mad_hdr.base_version = 1;
+ mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad.mad_hdr.class_version = 2;
+ mad.mad_hdr.method = IB_SA_METHOD_DELETE;
+ mad.mad_hdr.status = cpu_to_be16(0);
+ mad.mad_hdr.class_specific = cpu_to_be16(0);
+ mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+ group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad.mad_hdr.attr_mod = cpu_to_be32(0);
+ mad.sa_hdr.sm_key = 0x0;
+ mad.sa_hdr.attr_offset = cpu_to_be16(7);
+ mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+ *sa_data = group->rec;
+ sa_data->scope_join_state = join_state;
+
+ ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+ if (ret)
+ group->state = MCAST_IDLE;
+
+ /* set timeout handler */
+ if (!ret) {
+ /* calls mlx4_ib_mcg_timeout_handler */
+ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+ msecs_to_jiffies(MAD_TIMEOUT_MS));
+ }
+
+ return ret;
+}
+
+static int send_reply_to_slave(int slave, struct mcast_group *group,
+ struct ib_sa_mad *req_sa_mad, u16 status)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+ struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data;
+ int ret;
+
+ memset(&mad, 0, sizeof mad);
+ mad.mad_hdr.base_version = 1;
+ mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad.mad_hdr.class_version = 2;
+ mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+ mad.mad_hdr.status = cpu_to_be16(status);
+ mad.mad_hdr.class_specific = cpu_to_be16(0);
+ mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid;
+ *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */
+ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad.mad_hdr.attr_mod = cpu_to_be32(0);
+ mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key;
+ mad.sa_hdr.attr_offset = cpu_to_be16(7);
+ mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */
+
+ *sa_data = group->rec;
+
+ /* reconstruct VF's requested join_state and port_gid */
+ sa_data->scope_join_state &= 0xf0;
+ sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f);
+ memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid);
+
+ ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad);
+ return ret;
+}
+
+static int check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 src_value, u8 dst_value)
+{
+ int err;
+ u8 selector = dst_value >> 6;
+ dst_value &= 0x3f;
+ src_value &= 0x3f;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static u16 cmp_rec(struct ib_sa_mcmember_data *src,
+ struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask)
+{
+ /* src is group record, dst is request record */
+ /* MGID must already match */
+ /* Port_GID we always replace to our Port_GID, so it is a match */
+
+#define MAD_STATUS_REQ_INVALID 0x0200
+ if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ IB_SA_MCMEMBER_REC_MTU,
+ src->mtusel_mtu, dst->mtusel_mtu))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+ src->tclass != dst->tclass)
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ IB_SA_MCMEMBER_REC_RATE,
+ src->ratesel_rate, dst->ratesel_rate))
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ src->lifetmsel_lifetm, dst->lifetmsel_lifetm))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SL &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE &&
+ (src->scope_join_state & 0xf0) !=
+ (dst->scope_join_state & 0xf0))
+ return MAD_STATUS_REQ_INVALID;
+
+ /* join_state checked separately, proxy_join ignored */
+
+ return 0;
+}
+
+/* release group, return 1 if this was last release and group is destroyed
+ * timout work is canceled sync */
+static int release_group(struct mcast_group *group, int from_timeout_handler)
+{
+ struct mlx4_ib_demux_ctx *ctx = group->demux;
+ int nzgroup;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ mutex_lock(&group->lock);
+ if (atomic_dec_and_test(&group->refcount)) {
+ if (!from_timeout_handler) {
+ if (group->state != MCAST_IDLE &&
+ !cancel_delayed_work(&group->timeout_work)) {
+ atomic_inc(&group->refcount);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ return 0;
+ }
+ }
+
+ nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0);
+ if (nzgroup)
+ del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+ if (!list_empty(&group->pending_list))
+ mcg_warn_group(group, "releasing a group with non empty pending list\n");
+ if (nzgroup)
+ rb_erase(&group->node, &ctx->mcg_table);
+ list_del_init(&group->mgid0_list);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ kfree(group);
+ return 1;
+ } else {
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ }
+ return 0;
+}
+
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+ int i;
+
+ for (i = 0; i < 3; i++, join_state >>= 1)
+ if (join_state & 0x1)
+ group->members[i] += inc;
+}
+
+static u8 get_leave_state(struct mcast_group *group)
+{
+ u8 leave_state = 0;
+ int i;
+
+ for (i = 0; i < 3; i++)
+ if (!group->members[i])
+ leave_state |= (1 << i);
+
+ return leave_state & (group->rec.scope_join_state & 0xf);
+}
+
+static int join_group(struct mcast_group *group, int slave, u8 join_mask)
+{
+ int ret = 0;
+ u8 join_state;
+
+ /* remove bits that slave is already member of, and adjust */
+ join_state = join_mask & (~group->func[slave].join_state);
+ adjust_membership(group, join_state, 1);
+ group->func[slave].join_state |= join_state;
+ if (group->func[slave].state != MCAST_MEMBER && join_state) {
+ group->func[slave].state = MCAST_MEMBER;
+ ret = 1;
+ }
+ return ret;
+}
+
+static int leave_group(struct mcast_group *group, int slave, u8 leave_state)
+{
+ int ret = 0;
+
+ adjust_membership(group, leave_state, -1);
+ group->func[slave].join_state &= ~leave_state;
+ if (!group->func[slave].join_state) {
+ group->func[slave].state = MCAST_NOT_MEMBER;
+ ret = 1;
+ }
+ return ret;
+}
+
+static int check_leave(struct mcast_group *group, int slave, u8 leave_mask)
+{
+ if (group->func[slave].state != MCAST_MEMBER)
+ return MAD_STATUS_REQ_INVALID;
+
+ /* make sure we're not deleting unset bits */
+ if (~group->func[slave].join_state & leave_mask)
+ return MAD_STATUS_REQ_INVALID;
+
+ if (!leave_mask)
+ return MAD_STATUS_REQ_INVALID;
+
+ return 0;
+}
+
+static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct mcast_group *group;
+ struct mcast_req *req = NULL;
+
+ group = container_of(delay, typeof(*group), timeout_work);
+
+ mutex_lock(&group->lock);
+ if (group->state == MCAST_JOIN_SENT) {
+ if (!list_empty(&group->pending_list)) {
+ req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ --group->func[req->func].num_pend_reqs;
+ mutex_unlock(&group->lock);
+ kfree(req);
+ if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) {
+ if (release_group(group, 1))
+ return;
+ } else {
+ kfree(group);
+ return;
+ }
+ mutex_lock(&group->lock);
+ } else
+ mcg_warn_group(group, "DRIVER BUG\n");
+ } else if (group->state == MCAST_LEAVE_SENT) {
+ if (group->rec.scope_join_state & 0xf)
+ group->rec.scope_join_state &= 0xf0;
+ group->state = MCAST_IDLE;
+ mutex_unlock(&group->lock);
+ if (release_group(group, 1))
+ return;
+ mutex_lock(&group->lock);
+ } else
+ mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
+ group->state = MCAST_IDLE;
+ atomic_inc(&group->refcount);
+ if (!queue_work(group->demux->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+
+ mutex_unlock(&group->lock);
+}
+
+static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
+ struct mcast_req *req)
+{
+ u16 status;
+
+ if (req->clean)
+ leave_mask = group->func[req->func].join_state;
+
+ status = check_leave(group, req->func, leave_mask);
+ if (!status)
+ leave_group(group, req->func, leave_mask);
+
+ if (!req->clean)
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ return 1;
+}
+
+static int handle_join_req(struct mcast_group *group, u8 join_mask,
+ struct mcast_req *req)
+{
+ u8 group_join_state = group->rec.scope_join_state & 0xf;
+ int ref = 0;
+ u16 status;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+
+ if (join_mask == (group_join_state & join_mask)) {
+ /* port's membership need not change */
+ status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask);
+ if (!status)
+ join_group(group, req->func, join_mask);
+
+ --group->func[req->func].num_pend_reqs;
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ++ref;
+ } else {
+ /* port's membership needs to be updated */
+ group->prev_state = group->state;
+ if (send_join_to_wire(group, &req->sa_mad)) {
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ref = 1;
+ group->state = group->prev_state;
+ } else
+ group->state = MCAST_JOIN_SENT;
+ }
+
+ return ref;
+}
+
+static void mlx4_ib_mcg_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_req *req = NULL;
+ struct ib_sa_mcmember_data *sa_data;
+ u8 req_join_state;
+ int rc = 1; /* release_count - this is for the scheduled work */
+ u16 status;
+ u8 method;
+
+ group = container_of(work, typeof(*group), work);
+
+ mutex_lock(&group->lock);
+
+ /* First, let's see if a response from SM is waiting regarding this group.
+ * If so, we need to update the group's REC. If this is a bad response, we
+ * may need to send a bad response to a VF waiting for it. If VF is waiting
+ * and this is a good response, the VF will be answered later in this func. */
+ if (group->state == MCAST_RESP_READY) {
+ /* cancels mlx4_ib_mcg_timeout_handler */
+ cancel_delayed_work(&group->timeout_work);
+ status = be16_to_cpu(group->response_sa_mad.mad_hdr.status);
+ method = group->response_sa_mad.mad_hdr.method;
+ if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
+ mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
+ be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
+ be64_to_cpu(group->last_req_tid));
+ group->state = group->prev_state;
+ goto process_requests;
+ }
+ if (status) {
+ if (!list_empty(&group->pending_list))
+ req = list_first_entry(&group->pending_list,
+ struct mcast_req, group_list);
+ if (method == IB_MGMT_METHOD_GET_RESP) {
+ if (req) {
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ++rc;
+ } else
+ mcg_warn_group(group, "no request for failed join\n");
+ } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing)
+ ++rc;
+ } else {
+ u8 resp_join_state;
+ u8 cur_join_state;
+
+ resp_join_state = ((struct ib_sa_mcmember_data *)
+ group->response_sa_mad.data)->scope_join_state & 0xf;
+ cur_join_state = group->rec.scope_join_state & 0xf;
+
+ if (method == IB_MGMT_METHOD_GET_RESP) {
+ /* successfull join */
+ if (!cur_join_state && resp_join_state)
+ --rc;
+ } else if (!resp_join_state)
+ ++rc;
+ memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec);
+ }
+ group->state = MCAST_IDLE;
+ }
+
+process_requests:
+ /* We should now go over pending join/leave requests, as long as we are idle. */
+ while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) {
+ req = list_first_entry(&group->pending_list, struct mcast_req,
+ group_list);
+ sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+ req_join_state = sa_data->scope_join_state & 0xf;
+
+ /* For a leave request, we will immediately answer the VF, and
+ * update our internal counters. The actual leave will be sent
+ * to SM later, if at all needed. We dequeue the request now. */
+ if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE)
+ rc += handle_leave_req(group, req_join_state, req);
+ else
+ rc += handle_join_req(group, req_join_state, req);
+ }
+
+ /* Handle leaves */
+ if (group->state == MCAST_IDLE) {
+ req_join_state = get_leave_state(group);
+ if (req_join_state) {
+ group->rec.scope_join_state &= ~req_join_state;
+ group->prev_state = group->state;
+ if (send_leave_to_wire(group, req_join_state)) {
+ group->state = group->prev_state;
+ ++rc;
+ } else
+ group->state = MCAST_LEAVE_SENT;
+ }
+ }
+
+ if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE)
+ goto process_requests;
+ mutex_unlock(&group->lock);
+
+ while (rc--)
+ release_group(group, 0);
+}
+
+static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx,
+ __be64 tid,
+ union ib_gid *new_mgid)
+{
+ struct mcast_group *group = NULL, *cur_group, *n;
+ struct mcast_req *req;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) {
+ mutex_lock(&group->lock);
+ if (group->last_req_tid == tid) {
+ if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
+ group->rec.mgid = *new_mgid;
+ sprintf(group->name, "%016llx%016llx",
+ be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+ be64_to_cpu(group->rec.mgid.global.interface_id));
+ list_del_init(&group->mgid0_list);
+ cur_group = mcast_insert(ctx, group);
+ if (cur_group) {
+ /* A race between our code and SM. Silently cleaning the new one */
+ req = list_first_entry(&group->pending_list,
+ struct mcast_req, group_list);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ release_group(group, 0);
+ return NULL;
+ }
+
+ atomic_inc(&group->refcount);
+ add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ return group;
+ } else {
+ struct mcast_req *tmp1, *tmp2;
+
+ list_del(&group->mgid0_list);
+ if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE)
+ cancel_delayed_work_sync(&group->timeout_work);
+
+ list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) {
+ list_del(&tmp1->group_list);
+ kfree(tmp1);
+ }
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ kfree(group);
+ return NULL;
+ }
+ }
+ mutex_unlock(&group->lock);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+
+ return NULL;
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
+ union ib_gid *mgid, int create)
+{
+ struct mcast_group *group, *cur_group;
+ int is_mgid0;
+ int i;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ group = mcast_find(ctx, mgid);
+ if (group)
+ goto found;
+ }
+
+ if (!create)
+ return ERR_PTR(-ENOENT);
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ group->demux = ctx;
+ group->rec.mgid = *mgid;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->mgid0_list);
+ for (i = 0; i < MAX_VFS; ++i)
+ INIT_LIST_HEAD(&group->func[i].pending);
+ INIT_WORK(&group->work, mlx4_ib_mcg_work_handler);
+ INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
+ mutex_init(&group->lock);
+ sprintf(group->name, "%016llx%016llx",
+ be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+ be64_to_cpu(group->rec.mgid.global.interface_id));
+ sysfs_attr_init(&group->dentry.attr);
+ group->dentry.show = sysfs_show_group;
+ group->dentry.store = NULL;
+ group->dentry.attr.name = group->name;
+ group->dentry.attr.mode = 0400;
+ group->state = MCAST_IDLE;
+
+ if (is_mgid0) {
+ list_add(&group->mgid0_list, &ctx->mcg_mgid0_list);
+ goto found;
+ }
+
+ cur_group = mcast_insert(ctx, group);
+ if (cur_group) {
+ mcg_warn("group just showed up %s - confused\n", cur_group->name);
+ kfree(group);
+ return ERR_PTR(-EINVAL);
+ }
+
+ add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+
+found:
+ atomic_inc(&group->refcount);
+ return group;
+}
+
+static void queue_req(struct mcast_req *req)
+{
+ struct mcast_group *group = req->group;
+
+ atomic_inc(&group->refcount); /* for the request */
+ atomic_inc(&group->refcount); /* for scheduling the work */
+ list_add_tail(&req->group_list, &group->pending_list);
+ list_add_tail(&req->func_list, &group->func[req->func].pending);
+ /* calls mlx4_ib_mcg_work_handler */
+ if (!queue_work(group->demux->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+}
+
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data;
+ struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+ struct mcast_group *group;
+
+ switch (mad->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_SA_METHOD_DELETE_RESP:
+ mutex_lock(&ctx->mcg_table_lock);
+ group = acquire_group(ctx, &rec->mgid, 0);
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (IS_ERR(group)) {
+ if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
+ __be64 tid = mad->mad_hdr.tid;
+ *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */
+ group = search_relocate_mgid0_group(ctx, tid, &rec->mgid);
+ } else
+ group = NULL;
+ }
+
+ if (!group)
+ return 1;
+
+ mutex_lock(&group->lock);
+ group->response_sa_mad = *mad;
+ group->prev_state = group->state;
+ group->state = MCAST_RESP_READY;
+ /* calls mlx4_ib_mcg_work_handler */
+ atomic_inc(&group->refcount);
+ if (!queue_work(ctx->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+ mutex_unlock(&group->lock);
+ release_group(group, 0);
+ return 1; /* consumed */
+ case IB_MGMT_METHOD_SET:
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ case IB_SA_METHOD_DELETE:
+ return 0; /* not consumed, pass-through to guest over tunnel */
+ default:
+ mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n",
+ port, mad->mad_hdr.method);
+ return 1; /* consumed */
+ }
+}
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
+ int slave, struct ib_sa_mad *sa_mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data;
+ struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+ struct mcast_group *group;
+ struct mcast_req *req;
+ int may_create = 0;
+
+ if (ctx->flushing)
+ return -EAGAIN;
+
+ switch (sa_mad->mad_hdr.method) {
+ case IB_MGMT_METHOD_SET:
+ may_create = 1;
+ fallthrough;
+ case IB_SA_METHOD_DELETE:
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->func = slave;
+ req->sa_mad = *sa_mad;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ group = acquire_group(ctx, &rec->mgid, may_create);
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (IS_ERR(group)) {
+ kfree(req);
+ return PTR_ERR(group);
+ }
+ mutex_lock(&group->lock);
+ if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
+ mutex_unlock(&group->lock);
+ mcg_debug_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
+ port, slave, MAX_PEND_REQS_PER_FUNC);
+ release_group(group, 0);
+ kfree(req);
+ return -ENOMEM;
+ }
+ ++group->func[slave].num_pend_reqs;
+ req->group = group;
+ queue_req(req);
+ mutex_unlock(&group->lock);
+ release_group(group, 0);
+ return 1; /* consumed */
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ case IB_SA_METHOD_DELETE_RESP:
+ return 0; /* not consumed, pass-through */
+ default:
+ mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n",
+ port, slave, sa_mad->mad_hdr.method);
+ return 1; /* consumed */
+ }
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mcast_group *group =
+ container_of(attr, struct mcast_group, dentry);
+ struct mcast_req *req = NULL;
+ char state_str[40];
+ char pending_str[40];
+ int len;
+ int i;
+ u32 hoplimit;
+
+ if (group->state == MCAST_IDLE)
+ scnprintf(state_str, sizeof(state_str), "%s",
+ get_state_string(group->state));
+ else
+ scnprintf(state_str, sizeof(state_str), "%s(TID=0x%llx)",
+ get_state_string(group->state),
+ be64_to_cpu(group->last_req_tid));
+
+ if (list_empty(&group->pending_list)) {
+ scnprintf(pending_str, sizeof(pending_str), "No");
+ } else {
+ req = list_first_entry(&group->pending_list, struct mcast_req,
+ group_list);
+ scnprintf(pending_str, sizeof(pending_str), "Yes(TID=0x%llx)",
+ be64_to_cpu(req->sa_mad.mad_hdr.tid));
+ }
+
+ len = sysfs_emit(buf, "%1d [%02d,%02d,%02d] %4d %4s %5s ",
+ group->rec.scope_join_state & 0xf,
+ group->members[2],
+ group->members[1],
+ group->members[0],
+ atomic_read(&group->refcount),
+ pending_str,
+ state_str);
+
+ for (i = 0; i < MAX_VFS; i++) {
+ if (group->func[i].state == MCAST_MEMBER)
+ len += sysfs_emit_at(buf, len, "%d[%1x] ", i,
+ group->func[i].join_state);
+ }
+
+ hoplimit = be32_to_cpu(group->rec.sl_flowlabel_hoplimit);
+ len += sysfs_emit_at(buf, len,
+ "\t\t(%4hx %4x %2x %2x %2x %2x %2x %4x %4x %2x %2x)\n",
+ be16_to_cpu(group->rec.pkey),
+ be32_to_cpu(group->rec.qkey),
+ (group->rec.mtusel_mtu & 0xc0) >> 6,
+ (group->rec.mtusel_mtu & 0x3f),
+ group->rec.tclass,
+ (group->rec.ratesel_rate & 0xc0) >> 6,
+ (group->rec.ratesel_rate & 0x3f),
+ (hoplimit & 0xf0000000) >> 28,
+ (hoplimit & 0x0fffff00) >> 8,
+ (hoplimit & 0x000000ff),
+ group->rec.proxy_join);
+
+ return len;
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx)
+{
+ char name[20];
+
+ atomic_set(&ctx->tid, 0);
+ sprintf(name, "mlx4_ib_mcg%d", ctx->port);
+ ctx->mcg_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (!ctx->mcg_wq)
+ return -ENOMEM;
+
+ mutex_init(&ctx->mcg_table_lock);
+ ctx->mcg_table = RB_ROOT;
+ INIT_LIST_HEAD(&ctx->mcg_mgid0_list);
+ ctx->flushing = 0;
+
+ return 0;
+}
+
+static void force_clean_group(struct mcast_group *group)
+{
+ struct mcast_req *req, *tmp
+ ;
+ list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) {
+ list_del(&req->group_list);
+ kfree(req);
+ }
+ del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr);
+ rb_erase(&group->node, &group->demux->mcg_table);
+ kfree(group);
+}
+
+static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+ int i;
+ struct rb_node *p;
+ struct mcast_group *group;
+ unsigned long end;
+ int count;
+
+ for (i = 0; i < MAX_VFS; ++i)
+ clean_vf_mcast(ctx, i);
+
+ end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
+ do {
+ count = 0;
+ mutex_lock(&ctx->mcg_table_lock);
+ for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p))
+ ++count;
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (!count)
+ break;
+
+ usleep_range(1000, 2000);
+ } while (time_after(end, jiffies));
+
+ flush_workqueue(ctx->mcg_wq);
+ if (destroy_wq)
+ destroy_workqueue(ctx->mcg_wq);
+
+ mutex_lock(&ctx->mcg_table_lock);
+ while ((p = rb_first(&ctx->mcg_table)) != NULL) {
+ group = rb_entry(p, struct mcast_group, node);
+ if (atomic_read(&group->refcount))
+ mcg_debug_group(group, "group refcount %d!!! (pointer %p)\n",
+ atomic_read(&group->refcount), group);
+
+ force_clean_group(group);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+}
+
+struct clean_work {
+ struct work_struct work;
+ struct mlx4_ib_demux_ctx *ctx;
+ int destroy_wq;
+};
+
+static void mcg_clean_task(struct work_struct *work)
+{
+ struct clean_work *cw = container_of(work, struct clean_work, work);
+
+ _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq);
+ cw->ctx->flushing = 0;
+ kfree(cw);
+}
+
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+ struct clean_work *work;
+
+ if (ctx->flushing)
+ return;
+
+ ctx->flushing = 1;
+
+ if (destroy_wq) {
+ _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq);
+ ctx->flushing = 0;
+ return;
+ }
+
+ work = kmalloc(sizeof *work, GFP_KERNEL);
+ if (!work) {
+ ctx->flushing = 0;
+ return;
+ }
+
+ work->ctx = ctx;
+ work->destroy_wq = destroy_wq;
+ INIT_WORK(&work->work, mcg_clean_task);
+ queue_work(clean_wq, &work->work);
+}
+
+static void build_leave_mad(struct mcast_req *req)
+{
+ struct ib_sa_mad *mad = &req->sa_mad;
+
+ mad->mad_hdr.method = IB_SA_METHOD_DELETE;
+}
+
+
+static void clear_pending_reqs(struct mcast_group *group, int vf)
+{
+ struct mcast_req *req, *tmp, *group_first = NULL;
+ int clear;
+ int pend = 0;
+
+ if (!list_empty(&group->pending_list))
+ group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+
+ list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) {
+ clear = 1;
+ if (group_first == req &&
+ (group->state == MCAST_JOIN_SENT ||
+ group->state == MCAST_LEAVE_SENT)) {
+ clear = cancel_delayed_work(&group->timeout_work);
+ pend = !clear;
+ group->state = MCAST_IDLE;
+ }
+ if (clear) {
+ --group->func[vf].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ atomic_dec(&group->refcount);
+ }
+ }
+
+ if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) {
+ mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n",
+ list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs);
+ }
+}
+
+static int push_deleteing_req(struct mcast_group *group, int slave)
+{
+ struct mcast_req *req;
+ struct mcast_req *pend_req;
+
+ if (!group->func[slave].join_state)
+ return 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ if (!list_empty(&group->func[slave].pending)) {
+ pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list);
+ if (pend_req->clean) {
+ kfree(req);
+ return 0;
+ }
+ }
+
+ req->clean = 1;
+ req->func = slave;
+ req->group = group;
+ ++group->func[slave].num_pend_reqs;
+ build_leave_mad(req);
+ queue_req(req);
+ return 0;
+}
+
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave)
+{
+ struct mcast_group *group;
+ struct rb_node *p;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) {
+ group = rb_entry(p, struct mcast_group, node);
+ mutex_lock(&group->lock);
+ if (atomic_read(&group->refcount)) {
+ /* clear pending requests of this VF */
+ clear_pending_reqs(group, slave);
+ push_deleteing_req(group, slave);
+ }
+ mutex_unlock(&group->lock);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+}
+
+
+int mlx4_ib_mcg_init(void)
+{
+ clean_wq = alloc_ordered_workqueue("mlx4_ib_mcg", WQ_MEM_RECLAIM);
+ if (!clean_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx4_ib_mcg_destroy(void)
+{
+ destroy_workqueue(clean_wq);
+}
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644
index 000000000..6a3b0f121
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/idr.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cq.h>
+
+#define MLX4_IB_DRV_NAME "mlx4_ib"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__
+
+#define mlx4_ib_warn(ibdev, format, arg...) \
+ dev_warn((ibdev)->dev.parent, MLX4_IB_DRV_NAME ": " format, ## arg)
+
+enum {
+ MLX4_IB_SQ_MIN_WQE_SHIFT = 6,
+ MLX4_IB_MAX_HEADROOM = 2048
+};
+
+#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1)
+#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
+
+/*module param to indicate if SM assigns the alias_GUID*/
+extern int mlx4_ib_sm_guid_assign;
+
+#define MLX4_IB_UC_STEER_QPN_ALIGN 1
+#define MLX4_IB_UC_MAX_NUM_QPS 256
+
+enum hw_bar_type {
+ HW_BAR_BF,
+ HW_BAR_DB,
+ HW_BAR_CLOCK,
+ HW_BAR_COUNT
+};
+
+struct mlx4_ib_ucontext {
+ struct ib_ucontext ibucontext;
+ struct mlx4_uar uar;
+ struct list_head db_page_list;
+ struct mutex db_page_mutex;
+ struct list_head wqn_ranges_list;
+ struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */
+};
+
+struct mlx4_ib_pd {
+ struct ib_pd ibpd;
+ u32 pdn;
+};
+
+struct mlx4_ib_xrcd {
+ struct ib_xrcd ibxrcd;
+ u32 xrcdn;
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+};
+
+struct mlx4_ib_cq_buf {
+ struct mlx4_buf buf;
+ struct mlx4_mtt mtt;
+ int entry_size;
+};
+
+struct mlx4_ib_cq_resize {
+ struct mlx4_ib_cq_buf buf;
+ int cqe;
+};
+
+struct mlx4_ib_cq {
+ struct ib_cq ibcq;
+ struct mlx4_cq mcq;
+ struct mlx4_ib_cq_buf buf;
+ struct mlx4_ib_cq_resize *resize_buf;
+ struct mlx4_db db;
+ spinlock_t lock;
+ struct mutex resize_mutex;
+ struct ib_umem *umem;
+ struct ib_umem *resize_umem;
+ int create_flags;
+ /* List of qps that it serves.*/
+ struct list_head send_qp_list;
+ struct list_head recv_qp_list;
+};
+
+#define MLX4_MR_PAGES_ALIGN 0x40
+
+struct mlx4_ib_mr {
+ struct ib_mr ibmr;
+ __be64 *pages;
+ dma_addr_t page_map;
+ u32 npages;
+ u32 max_pages;
+ struct mlx4_mr mmr;
+ struct ib_umem *umem;
+ size_t page_map_size;
+};
+
+struct mlx4_ib_mw {
+ struct ib_mw ibmw;
+ struct mlx4_mw mmw;
+};
+
+#define MAX_REGS_PER_FLOW 2
+
+struct mlx4_flow_reg_id {
+ u64 id;
+ u64 mirror;
+};
+
+struct mlx4_ib_flow {
+ struct ib_flow ibflow;
+ /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */
+ struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW];
+};
+
+struct mlx4_ib_wq {
+ u64 *wrid;
+ spinlock_t lock;
+ int wqe_cnt;
+ int max_post;
+ int max_gs;
+ int offset;
+ int wqe_shift;
+ unsigned head;
+ unsigned tail;
+};
+
+enum {
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START
+};
+
+enum mlx4_ib_qp_flags {
+ MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+ MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
+ MLX4_IB_QP_SCATTER_FCS = IB_QP_CREATE_SCATTER_FCS,
+
+ /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */
+ MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI,
+ MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
+ MLX4_IB_SRIOV_SQP = 1 << 31,
+};
+
+struct mlx4_ib_gid_entry {
+ struct list_head list;
+ union ib_gid gid;
+ int added;
+ u8 port;
+};
+
+enum mlx4_ib_qp_type {
+ /*
+ * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+ * here (and in that order) since the MAD layer uses them as
+ * indices into a 2-entry table.
+ */
+ MLX4_IB_QPT_SMI = IB_QPT_SMI,
+ MLX4_IB_QPT_GSI = IB_QPT_GSI,
+
+ MLX4_IB_QPT_RC = IB_QPT_RC,
+ MLX4_IB_QPT_UC = IB_QPT_UC,
+ MLX4_IB_QPT_UD = IB_QPT_UD,
+ MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6,
+ MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE,
+ MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET,
+ MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI,
+ MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT,
+
+ MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16,
+ MLX4_IB_QPT_PROXY_SMI = 1 << 17,
+ MLX4_IB_QPT_PROXY_GSI = 1 << 18,
+ MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19,
+ MLX4_IB_QPT_TUN_SMI = 1 << 20,
+ MLX4_IB_QPT_TUN_GSI = 1 << 21,
+};
+
+#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \
+ MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI)
+
+enum mlx4_ib_mad_ifc_flags {
+ MLX4_MAD_IFC_IGNORE_MKEY = 1,
+ MLX4_MAD_IFC_IGNORE_BKEY = 2,
+ MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY |
+ MLX4_MAD_IFC_IGNORE_BKEY),
+ MLX4_MAD_IFC_NET_VIEW = 4,
+};
+
+enum {
+ MLX4_NUM_TUNNEL_BUFS = 512,
+ MLX4_NUM_WIRE_BUFS = 2048,
+};
+
+struct mlx4_ib_tunnel_header {
+ struct mlx4_av av;
+ __be32 remote_qpn;
+ __be32 qkey;
+ __be16 vlan;
+ u8 mac[6];
+ __be16 pkey_index;
+ u8 reserved[6];
+};
+
+struct mlx4_ib_buf {
+ void *addr;
+ dma_addr_t map;
+};
+
+struct mlx4_rcv_tunnel_hdr {
+ __be32 flags_src_qp; /* flags[6:5] is defined for VLANs:
+ * 0x0 - no vlan was in the packet
+ * 0x01 - C-VLAN was in the packet */
+ u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */
+ u8 reserved;
+ __be16 pkey_index;
+ __be16 sl_vid;
+ __be16 slid_mac_47_32;
+ __be32 mac_31_0;
+};
+
+struct mlx4_ib_proxy_sqp_hdr {
+ struct ib_grh grh;
+ struct mlx4_rcv_tunnel_hdr tun;
+} __packed;
+
+struct mlx4_roce_smac_vlan_info {
+ u64 smac;
+ int smac_index;
+ int smac_port;
+ u64 candidate_smac;
+ int candidate_smac_index;
+ int candidate_smac_port;
+ u16 vid;
+ int vlan_index;
+ int vlan_port;
+ u16 candidate_vid;
+ int candidate_vlan_index;
+ int candidate_vlan_port;
+ int update_vid;
+};
+
+struct mlx4_wqn_range {
+ int base_wqn;
+ int size;
+ int refcount;
+ bool dirty;
+ struct list_head list;
+};
+
+struct mlx4_ib_rss {
+ unsigned int base_qpn_tbl_sz;
+ u8 flags;
+ u8 rss_key[MLX4_EN_RSS_KEY_SIZE];
+};
+
+enum {
+ /*
+ * Largest possible UD header: send with GRH and immediate
+ * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
+ * tag. (LRH would only use 8 bytes, so Ethernet is the
+ * biggest case)
+ */
+ MLX4_IB_UD_HEADER_SIZE = 82,
+ MLX4_IB_LSO_HEADER_SPARE = 128,
+};
+
+struct mlx4_ib_sqp {
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+ struct ib_qp *roce_v2_gsi;
+};
+
+struct mlx4_ib_qp {
+ union {
+ struct ib_qp ibqp;
+ struct ib_wq ibwq;
+ };
+ struct mlx4_qp mqp;
+ struct mlx4_buf buf;
+
+ struct mlx4_db db;
+ struct mlx4_ib_wq rq;
+
+ u32 doorbell_qpn;
+ __be32 sq_signal_bits;
+ unsigned sq_next_wqe;
+ int sq_spare_wqes;
+ struct mlx4_ib_wq sq;
+
+ enum mlx4_ib_qp_type mlx4_ib_qp_type;
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ int buf_size;
+ struct mutex mutex;
+ u16 xrcdn;
+ u32 flags;
+ u8 port;
+ u8 alt_port;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+ u8 sq_no_prefetch;
+ u8 state;
+ int mlx_type;
+ u32 inl_recv_sz;
+ struct list_head gid_list;
+ struct list_head steering_rules;
+ struct mlx4_ib_buf *sqp_proxy_rcv;
+ struct mlx4_roce_smac_vlan_info pri;
+ struct mlx4_roce_smac_vlan_info alt;
+ u64 reg_id;
+ struct list_head qps_list;
+ struct list_head cq_recv_list;
+ struct list_head cq_send_list;
+ struct counter_index *counter_index;
+ struct mlx4_wqn_range *wqn_range;
+ /* Number of RSS QP parents that uses this WQ */
+ u32 rss_usecnt;
+ union {
+ struct mlx4_ib_rss *rss_ctx;
+ struct mlx4_ib_sqp *sqp;
+ };
+};
+
+struct mlx4_ib_srq {
+ struct ib_srq ibsrq;
+ struct mlx4_srq msrq;
+ struct mlx4_buf buf;
+ struct mlx4_db db;
+ u64 *wrid;
+ spinlock_t lock;
+ int head;
+ int tail;
+ u16 wqe_ctr;
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ struct mutex mutex;
+};
+
+struct mlx4_ib_ah {
+ struct ib_ah ibah;
+ union mlx4_ext_av av;
+};
+
+struct mlx4_ib_rwq_ind_table {
+ struct ib_rwq_ind_table ib_rwq_ind_tbl;
+};
+
+/****************************************/
+/* alias guid support */
+/****************************************/
+#define NUM_PORT_ALIAS_GUID 2
+#define NUM_ALIAS_GUID_IN_REC 8
+#define NUM_ALIAS_GUID_REC_IN_PORT 16
+#define GUID_REC_SIZE 8
+#define NUM_ALIAS_GUID_PER_PORT 128
+#define MLX4_NOT_SET_GUID (0x00LL)
+#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL))
+
+enum mlx4_guid_alias_rec_status {
+ MLX4_GUID_INFO_STATUS_IDLE,
+ MLX4_GUID_INFO_STATUS_SET,
+};
+
+#define GUID_STATE_NEED_PORT_INIT 0x01
+
+enum mlx4_guid_alias_rec_method {
+ MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET,
+ MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE,
+};
+
+struct mlx4_sriov_alias_guid_info_rec_det {
+ u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC];
+ ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/
+ enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/
+ unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC];
+ u64 time_to_run;
+};
+
+struct mlx4_sriov_alias_guid_port_rec_det {
+ struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT];
+ struct workqueue_struct *wq;
+ struct delayed_work alias_guid_work;
+ u32 port;
+ u32 state_flags;
+ struct mlx4_sriov_alias_guid *parent;
+ struct list_head cb_list;
+};
+
+struct mlx4_sriov_alias_guid {
+ struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS];
+ spinlock_t ag_work_lock;
+ struct ib_sa_client *sa_client;
+};
+
+struct mlx4_ib_demux_work {
+ struct work_struct work;
+ struct mlx4_ib_dev *dev;
+ int slave;
+ int do_init;
+ u8 port;
+
+};
+
+struct mlx4_ib_tun_tx_buf {
+ struct mlx4_ib_buf buf;
+ struct ib_ah *ah;
+};
+
+struct mlx4_ib_demux_pv_qp {
+ struct ib_qp *qp;
+ enum ib_qp_type proxy_qpt;
+ struct mlx4_ib_buf *ring;
+ struct mlx4_ib_tun_tx_buf *tx_ring;
+ spinlock_t tx_lock;
+ unsigned tx_ix_head;
+ unsigned tx_ix_tail;
+};
+
+enum mlx4_ib_demux_pv_state {
+ DEMUX_PV_STATE_DOWN,
+ DEMUX_PV_STATE_STARTING,
+ DEMUX_PV_STATE_ACTIVE,
+ DEMUX_PV_STATE_DOWNING,
+};
+
+struct mlx4_ib_demux_pv_ctx {
+ int port;
+ int slave;
+ enum mlx4_ib_demux_pv_state state;
+ int has_smi;
+ struct ib_device *ib_dev;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct work_struct work;
+ struct workqueue_struct *wq;
+ struct workqueue_struct *wi_wq;
+ struct mlx4_ib_demux_pv_qp qp[2];
+};
+
+struct mlx4_ib_demux_ctx {
+ struct ib_device *ib_dev;
+ int port;
+ struct workqueue_struct *wq;
+ struct workqueue_struct *wi_wq;
+ struct workqueue_struct *ud_wq;
+ spinlock_t ud_lock;
+ atomic64_t subnet_prefix;
+ __be64 guid_cache[128];
+ struct mlx4_ib_dev *dev;
+ /* the following lock protects both mcg_table and mcg_mgid0_list */
+ struct mutex mcg_table_lock;
+ struct rb_root mcg_table;
+ struct list_head mcg_mgid0_list;
+ struct workqueue_struct *mcg_wq;
+ struct mlx4_ib_demux_pv_ctx **tun;
+ atomic_t tid;
+ int flushing; /* flushing the work queue */
+};
+
+struct mlx4_ib_sriov {
+ struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS];
+ struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS];
+ /* when using this spinlock you should use "irq" because
+ * it may be called from interrupt context.*/
+ spinlock_t going_down_lock;
+ int is_going_down;
+
+ struct mlx4_sriov_alias_guid alias_guid;
+
+ /* CM paravirtualization fields */
+ struct xarray pv_id_table;
+ u32 pv_id_next;
+ spinlock_t id_map_lock;
+ struct rb_root sl_id_map;
+ struct list_head cm_list;
+ struct xarray xa_rej_tmout;
+};
+
+struct gid_cache_context {
+ int real_index;
+ int refcount;
+};
+
+struct gid_entry {
+ union ib_gid gid;
+ enum ib_gid_type gid_type;
+ struct gid_cache_context *ctx;
+ u16 vlan_id;
+};
+
+struct mlx4_port_gid_table {
+ struct gid_entry gids[MLX4_MAX_PORT_GIDS];
+};
+
+struct mlx4_ib_iboe {
+ spinlock_t lock;
+ struct net_device *netdevs[MLX4_MAX_PORTS];
+ atomic64_t mac[MLX4_MAX_PORTS];
+ struct notifier_block nb;
+ struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
+ enum ib_port_state last_port_state[MLX4_MAX_PORTS];
+};
+
+struct pkey_mgt {
+ u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+ u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+ struct list_head pkey_port_list[MLX4_MFUNC_MAX];
+ struct kobject *device_parent[MLX4_MFUNC_MAX];
+};
+
+struct mlx4_ib_iov_sysfs_attr {
+ void *ctx;
+ struct kobject *kobj;
+ unsigned long data;
+ u32 entry_num;
+ char name[15];
+ struct device_attribute dentry;
+ struct device *dev;
+};
+
+struct mlx4_ib_iov_sysfs_attr_ar {
+ struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1];
+};
+
+struct mlx4_ib_iov_port {
+ char name[100];
+ u8 num;
+ struct mlx4_ib_dev *dev;
+ struct list_head list;
+ struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar;
+ struct ib_port_attr attr;
+ struct kobject *cur_port;
+ struct kobject *admin_alias_parent;
+ struct kobject *gids_parent;
+ struct kobject *pkeys_parent;
+ struct kobject *mcgs_parent;
+ struct mlx4_ib_iov_sysfs_attr mcg_dentry;
+};
+
+struct counter_index {
+ struct list_head list;
+ u32 index;
+ u8 allocated;
+};
+
+struct mlx4_ib_counters {
+ struct list_head counters_list;
+ struct mutex mutex; /* mutex for accessing counters list */
+ u32 default_counter;
+};
+
+#define MLX4_DIAG_COUNTERS_TYPES 2
+
+struct mlx4_ib_diag_counters {
+ struct rdma_stat_desc *descs;
+ u32 *offset;
+ u32 num_counters;
+};
+
+struct mlx4_ib_dev {
+ struct ib_device ib_dev;
+ struct mlx4_dev *dev;
+ int num_ports;
+ void __iomem *uar_map;
+
+ struct mlx4_uar priv_uar;
+ u32 priv_pdn;
+ MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+ struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2];
+ struct ib_ah *sm_ah[MLX4_MAX_PORTS];
+ spinlock_t sm_lock;
+ atomic64_t sl2vl[MLX4_MAX_PORTS];
+ struct mlx4_ib_sriov sriov;
+
+ struct mutex cap_mask_mutex;
+ bool ib_active;
+ struct mlx4_ib_iboe iboe;
+ struct mlx4_ib_counters counters_table[MLX4_MAX_PORTS];
+ int *eq_table;
+ struct kobject *iov_parent;
+ struct kobject *ports_parent;
+ struct kobject *dev_ports_parent[MLX4_MFUNC_MAX];
+ struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS];
+ struct pkey_mgt pkeys;
+ unsigned long *ib_uc_qpns_bitmap;
+ int steer_qpn_count;
+ int steer_qpn_base;
+ int steering_support;
+ struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS];
+ /* lock when destroying qp1_proxy and getting netdev events */
+ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS];
+ u8 bond_next_port;
+ /* protect resources needed as part of reset flow */
+ spinlock_t reset_flow_resource_lock;
+ struct list_head qp_list;
+ struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES];
+};
+
+struct ib_event_work {
+ struct work_struct work;
+ struct mlx4_ib_dev *ib_dev;
+ struct mlx4_eqe ib_eqe;
+ int port;
+};
+
+struct mlx4_ib_qp_tunnel_init_attr {
+ struct ib_qp_init_attr init_attr;
+ int slave;
+ enum ib_qp_type proxy_qp_type;
+ u32 port;
+};
+
+struct mlx4_uverbs_ex_query_device {
+ __u32 comp_mask;
+ __u32 reserved;
+};
+
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+}
+
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+}
+
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mlx4_ib_pd, ibpd);
+}
+
+static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+ return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd);
+}
+
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mlx4_ib_cq, ibcq);
+}
+
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+ return container_of(mcq, struct mlx4_ib_cq, mcq);
+}
+
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mlx4_ib_mr, ibmr);
+}
+
+static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw)
+{
+ return container_of(ibmw, struct mlx4_ib_mw, ibmw);
+}
+
+static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow)
+{
+ return container_of(ibflow, struct mlx4_ib_flow, ibflow);
+}
+
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mlx4_ib_qp, ibqp);
+}
+
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_qp, mqp);
+}
+
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+}
+
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+ return container_of(msrq, struct mlx4_ib_srq, msrq);
+}
+
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mlx4_ib_ah, ibah);
+}
+
+static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev)
+{
+ dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports;
+
+ return dev->bond_next_port + 1;
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
+
+int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
+ struct mlx4_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
+int mlx4_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
+int mlx4_ib_dealloc_mw(struct ib_mw *mw);
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg);
+int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset);
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+int mlx4_ib_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+ int slave_sgid_index, u8 *s_mac, u16 vlan_tag);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
+static inline int mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags)
+{
+ return 0;
+}
+
+int mlx4_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr);
+
+int mlx4_ib_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
+void mlx4_ib_drain_sq(struct ib_qp *qp);
+void mlx4_ib_drain_rq(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr);
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
+ int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad *in, struct ib_mad *out,
+ size_t *out_mad_size, u16 *out_mad_pkey_index);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+int __mlx4_ib_query_port(struct ib_device *ibdev, u32 port,
+ struct ib_port_attr *props, int netw_view);
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
+ u16 *pkey, int netw_view);
+
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
+ union ib_gid *gid, int netw_view);
+
+static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+ u32 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
+
+ if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET)
+ return true;
+
+ return !!(ah->av.ib.g_slid & 0x80);
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx);
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq);
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave);
+int mlx4_ib_mcg_init(void);
+void mlx4_ib_mcg_destroy(void);
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u32 port, __be64 guid);
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *sa_mad);
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *mad);
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ union ib_gid *gid);
+
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u32 port_num,
+ enum ib_event_type type);
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work);
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u32 port,
+ enum ib_qp_type qpt, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_mad *mad);
+
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u32 port,
+ enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+ u32 qkey, struct rdma_ah_attr *attr, u8 *s_mac,
+ u16 vlan_id, struct ib_mad *mad);
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+ struct ib_mad *mad);
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+ struct ib_mad *mad);
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id);
+
+/* alias guid support */
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port);
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port);
+
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num,
+ u32 port_num, u8 *p_data);
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num, u32 port_num,
+ u8 *p_data);
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr);
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr);
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index);
+void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave,
+ int port, int slave_init);
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ;
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);
+
+__be64 mlx4_ib_gen_node_guid(void);
+
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ int is_attach);
+struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
+ u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd,
+ struct ib_udata *udata);
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+ const struct ib_gid_attr *attr);
+
+void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev,
+ int port);
+
+void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port);
+
+struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
+int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+ u32 wq_attr_mask, struct ib_udata *udata);
+
+int mlx4_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl,
+ struct ib_rwq_ind_table_init_attr *init_attr,
+ struct ib_udata *udata);
+static inline int
+mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table)
+{
+ return 0;
+}
+int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
+ int *num_of_mtts);
+
+int mlx4_ib_cm_init(void);
+void mlx4_ib_cm_destroy(void);
+
+#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644
index 000000000..a40bf58bc
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "mlx4_ib.h"
+
+static u32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) |
+ (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) |
+ MLX4_PERM_LOCAL_READ;
+}
+
+static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type)
+{
+ switch (type) {
+ case IB_MW_TYPE_1: return MLX4_MW_TYPE_1;
+ case IB_MW_TYPE_2: return MLX4_MW_TYPE_2;
+ default: return -1;
+ }
+}
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mlx4_ib_mr *mr;
+ int err;
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
+ ~0ull, convert_access(acc), 0, 0, &mr->mmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_mr:
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+enum {
+ MLX4_MAX_MTT_SHIFT = 31
+};
+
+static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
+ struct mlx4_mtt *mtt,
+ u64 mtt_size, u64 mtt_shift, u64 len,
+ u64 cur_start_addr, u64 *pages,
+ int *start_index, int *npages)
+{
+ u64 cur_end_addr = cur_start_addr + len;
+ u64 cur_end_addr_aligned = 0;
+ u64 mtt_entries;
+ int err = 0;
+ int k;
+
+ len += (cur_start_addr & (mtt_size - 1ULL));
+ cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
+ len += (cur_end_addr_aligned - cur_end_addr);
+ if (len & (mtt_size - 1ULL)) {
+ pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n",
+ len, mtt_size);
+ return -EINVAL;
+ }
+
+ mtt_entries = (len >> mtt_shift);
+
+ /*
+ * Align the MTT start address to the mtt_size.
+ * Required to handle cases when the MR starts in the middle of an MTT
+ * record. Was not required in old code since the physical addresses
+ * provided by the dma subsystem were page aligned, which was also the
+ * MTT size.
+ */
+ cur_start_addr = round_down(cur_start_addr, mtt_size);
+ /* A new block is started ... */
+ for (k = 0; k < mtt_entries; ++k) {
+ pages[*npages] = cur_start_addr + (mtt_size * k);
+ (*npages)++;
+ /*
+ * Be friendly to mlx4_write_mtt() and pass it chunks of
+ * appropriate size.
+ */
+ if (*npages == PAGE_SIZE / sizeof(u64)) {
+ err = mlx4_write_mtt(dev->dev, mtt, *start_index,
+ *npages, pages);
+ if (err)
+ return err;
+
+ (*start_index) += *npages;
+ *npages = 0;
+ }
+ }
+
+ return 0;
+}
+
+static inline u64 alignment_of(u64 ptr)
+{
+ return ilog2(ptr & (~(ptr - 1)));
+}
+
+static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
+ u64 current_block_end,
+ u64 block_shift)
+{
+ /* Check whether the alignment of the new block is aligned as well as
+ * the previous block.
+ * Block address must start with zeros till size of entity_size.
+ */
+ if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
+ /*
+ * It is not as well aligned as the previous block-reduce the
+ * mtt size accordingly. Here we take the last right bit which
+ * is 1.
+ */
+ block_shift = alignment_of(next_block_start);
+
+ /*
+ * Check whether the alignment of the end of previous block - is it
+ * aligned as well as the start of the block
+ */
+ if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
+ /*
+ * It is not as well aligned as the start of the block -
+ * reduce the mtt size accordingly.
+ */
+ block_shift = alignment_of(current_block_end);
+
+ return block_shift;
+}
+
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem)
+{
+ u64 *pages;
+ u64 len = 0;
+ int err = 0;
+ u64 mtt_size;
+ u64 cur_start_addr = 0;
+ u64 mtt_shift;
+ int start_index = 0;
+ int npages = 0;
+ struct scatterlist *sg;
+ int i;
+
+ pages = (u64 *) __get_free_page(GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ mtt_shift = mtt->page_shift;
+ mtt_size = 1ULL << mtt_shift;
+
+ for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
+ if (cur_start_addr + len == sg_dma_address(sg)) {
+ /* still the same block */
+ len += sg_dma_len(sg);
+ continue;
+ }
+ /*
+ * A new block is started ...
+ * If len is malaligned, write an extra mtt entry to cover the
+ * misaligned area (round up the division)
+ */
+ err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
+ mtt_shift, len,
+ cur_start_addr,
+ pages, &start_index,
+ &npages);
+ if (err)
+ goto out;
+
+ cur_start_addr = sg_dma_address(sg);
+ len = sg_dma_len(sg);
+ }
+
+ /* Handle the last block */
+ if (len > 0) {
+ /*
+ * If len is malaligned, write an extra mtt entry to cover
+ * the misaligned area (round up the division)
+ */
+ err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
+ mtt_shift, len,
+ cur_start_addr, pages,
+ &start_index, &npages);
+ if (err)
+ goto out;
+ }
+
+ if (npages)
+ err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
+
+out:
+ free_page((unsigned long) pages);
+ return err;
+}
+
+/*
+ * Calculate optimal mtt size based on contiguous pages.
+ * Function will return also the number of pages that are not aligned to the
+ * calculated mtt_size to be added to total number of pages. For that we should
+ * check the first chunk length & last chunk length and if not aligned to
+ * mtt_size we should increment the non_aligned_pages number. All chunks in the
+ * middle already handled as part of mtt shift calculation for both their start
+ * & end addresses.
+ */
+int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
+ int *num_of_mtts)
+{
+ u64 block_shift = MLX4_MAX_MTT_SHIFT;
+ u64 min_shift = PAGE_SHIFT;
+ u64 last_block_aligned_end = 0;
+ u64 current_block_start = 0;
+ u64 first_block_start = 0;
+ u64 current_block_len = 0;
+ u64 last_block_end = 0;
+ struct scatterlist *sg;
+ u64 current_block_end;
+ u64 misalignment_bits;
+ u64 next_block_start;
+ u64 total_len = 0;
+ int i;
+
+ *num_of_mtts = ib_umem_num_dma_blocks(umem, PAGE_SIZE);
+
+ for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
+ /*
+ * Initialization - save the first chunk start as the
+ * current_block_start - block means contiguous pages.
+ */
+ if (current_block_len == 0 && current_block_start == 0) {
+ current_block_start = sg_dma_address(sg);
+ first_block_start = current_block_start;
+ /*
+ * Find the bits that are different between the physical
+ * address and the virtual address for the start of the
+ * MR.
+ * umem_get aligned the start_va to a page boundary.
+ * Therefore, we need to align the start va to the same
+ * boundary.
+ * misalignment_bits is needed to handle the case of a
+ * single memory region. In this case, the rest of the
+ * logic will not reduce the block size. If we use a
+ * block size which is bigger than the alignment of the
+ * misalignment bits, we might use the virtual page
+ * number instead of the physical page number, resulting
+ * in access to the wrong data.
+ */
+ misalignment_bits =
+ (start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^
+ current_block_start;
+ block_shift = min(alignment_of(misalignment_bits),
+ block_shift);
+ }
+
+ /*
+ * Go over the scatter entries and check if they continue the
+ * previous scatter entry.
+ */
+ next_block_start = sg_dma_address(sg);
+ current_block_end = current_block_start + current_block_len;
+ /* If we have a split (non-contig.) between two blocks */
+ if (current_block_end != next_block_start) {
+ block_shift = mlx4_ib_umem_calc_block_mtt
+ (next_block_start,
+ current_block_end,
+ block_shift);
+
+ /*
+ * If we reached the minimum shift for 4k page we stop
+ * the loop.
+ */
+ if (block_shift <= min_shift)
+ goto end;
+
+ /*
+ * If not saved yet we are in first block - we save the
+ * length of first block to calculate the
+ * non_aligned_pages number at the end.
+ */
+ total_len += current_block_len;
+
+ /* Start a new block */
+ current_block_start = next_block_start;
+ current_block_len = sg_dma_len(sg);
+ continue;
+ }
+ /* The scatter entry is another part of the current block,
+ * increase the block size.
+ * An entry in the scatter can be larger than 4k (page) as of
+ * dma mapping which merge some blocks together.
+ */
+ current_block_len += sg_dma_len(sg);
+ }
+
+ /* Account for the last block in the total len */
+ total_len += current_block_len;
+ /* Add to the first block the misalignment that it suffers from. */
+ total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
+ last_block_end = current_block_start + current_block_len;
+ last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift);
+ total_len += (last_block_aligned_end - last_block_end);
+
+ if (total_len & ((1ULL << block_shift) - 1ULL))
+ pr_warn("misaligned total length detected (%llu, %llu)!",
+ total_len, block_shift);
+
+ *num_of_mtts = total_len >> block_shift;
+end:
+ if (block_shift < min_shift) {
+ /*
+ * If shift is less than the min we set a warning and return the
+ * min shift.
+ */
+ pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift);
+
+ block_shift = min_shift;
+ }
+ return block_shift;
+}
+
+static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start,
+ u64 length, int access_flags)
+{
+ /*
+ * Force registering the memory as writable if the underlying pages
+ * are writable. This is so rereg can change the access permissions
+ * from readable to writable without having to run through ib_umem_get
+ * again
+ */
+ if (!ib_access_writable(access_flags)) {
+ unsigned long untagged_start = untagged_addr(start);
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(current->mm);
+ /*
+ * FIXME: Ideally this would iterate over all the vmas that
+ * cover the memory, but for now it requires a single vma to
+ * entirely cover the MR to support RO mappings.
+ */
+ vma = find_vma(current->mm, untagged_start);
+ if (vma && vma->vm_end >= untagged_start + length &&
+ vma->vm_start <= untagged_start) {
+ if (vma->vm_flags & VM_WRITE)
+ access_flags |= IB_ACCESS_LOCAL_WRITE;
+ } else {
+ access_flags |= IB_ACCESS_LOCAL_WRITE;
+ }
+
+ mmap_read_unlock(current->mm);
+ }
+
+ return ib_umem_get(device, start, length, access_flags);
+}
+
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mr *mr;
+ int shift;
+ int err;
+ int n;
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->umem = mlx4_get_umem_mr(pd->device, start, length, access_flags);
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ goto err_free;
+ }
+
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+ convert_access(access_flags), n, shift, &mr->mmr);
+ if (err)
+ goto err_umem;
+
+ err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+ if (err)
+ goto err_mr;
+
+ err = mlx4_mr_enable(dev->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->ibmr.page_size = 1U << shift;
+
+ return &mr->ibmr;
+
+err_mr:
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_umem:
+ ib_umem_release(mr->umem);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
+ u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(mr->device);
+ struct mlx4_ib_mr *mmr = to_mmr(mr);
+ struct mlx4_mpt_entry *mpt_entry;
+ struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
+ int err;
+
+ /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
+ * we assume that the calls can't run concurrently. Otherwise, a
+ * race exists.
+ */
+ err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
+ if (err)
+ return ERR_PTR(err);
+
+ if (flags & IB_MR_REREG_PD) {
+ err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
+ to_mpd(pd)->pdn);
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_MR_REREG_ACCESS) {
+ if (ib_access_writable(mr_access_flags) &&
+ !mmr->umem->writable) {
+ err = -EPERM;
+ goto release_mpt_entry;
+ }
+
+ err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
+ convert_access(mr_access_flags));
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_MR_REREG_TRANS) {
+ int shift;
+ int n;
+
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ mmr->umem = mlx4_get_umem_mr(mr->device, start, length,
+ mr_access_flags);
+ if (IS_ERR(mmr->umem)) {
+ err = PTR_ERR(mmr->umem);
+ /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
+ mmr->umem = NULL;
+ goto release_mpt_entry;
+ }
+ n = ib_umem_num_dma_blocks(mmr->umem, PAGE_SIZE);
+ shift = PAGE_SHIFT;
+
+ err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
+ virt_addr, length, n, shift,
+ *pmpt_entry);
+ if (err) {
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+ mmr->mmr.iova = virt_addr;
+ mmr->mmr.size = length;
+
+ err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
+ if (err) {
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+ }
+
+ /* If we couldn't transfer the MR to the HCA, just remember to
+ * return a failure. But dereg_mr will free the resources.
+ */
+ err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+ if (!err && flags & IB_MR_REREG_ACCESS)
+ mmr->mmr.access = mr_access_flags;
+
+release_mpt_entry:
+ mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
+ if (err)
+ return ERR_PTR(err);
+ return NULL;
+}
+
+static int
+mlx4_alloc_priv_pages(struct ib_device *device,
+ struct mlx4_ib_mr *mr,
+ int max_pages)
+{
+ int ret;
+
+ /* Ensure that size is aligned to DMA cacheline
+ * requirements.
+ * max_pages is limited to MLX4_MAX_FAST_REG_PAGES
+ * so page_map_size will never cross PAGE_SIZE.
+ */
+ mr->page_map_size = roundup(max_pages * sizeof(u64),
+ MLX4_MR_PAGES_ALIGN);
+
+ /* Prevent cross page boundary allocation. */
+ mr->pages = (__be64 *)get_zeroed_page(GFP_KERNEL);
+ if (!mr->pages)
+ return -ENOMEM;
+
+ mr->page_map = dma_map_single(device->dev.parent, mr->pages,
+ mr->page_map_size, DMA_TO_DEVICE);
+
+ if (dma_mapping_error(device->dev.parent, mr->page_map)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ return 0;
+
+err:
+ free_page((unsigned long)mr->pages);
+ return ret;
+}
+
+static void
+mlx4_free_priv_pages(struct mlx4_ib_mr *mr)
+{
+ if (mr->pages) {
+ struct ib_device *device = mr->ibmr.device;
+
+ dma_unmap_single(device->dev.parent, mr->page_map,
+ mr->page_map_size, DMA_TO_DEVICE);
+ free_page((unsigned long)mr->pages);
+ mr->pages = NULL;
+ }
+}
+
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+ struct mlx4_ib_mr *mr = to_mmr(ibmr);
+ int ret;
+
+ mlx4_free_priv_pages(mr);
+
+ ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+ if (ret)
+ return ret;
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+
+ return 0;
+}
+
+int mlx4_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibmw->device);
+ struct mlx4_ib_mw *mw = to_mmw(ibmw);
+ int err;
+
+ err = mlx4_mw_alloc(dev->dev, to_mpd(ibmw->pd)->pdn,
+ to_mlx4_type(ibmw->type), &mw->mmw);
+ if (err)
+ return err;
+
+ err = mlx4_mw_enable(dev->dev, &mw->mmw);
+ if (err)
+ goto err_mw;
+
+ ibmw->rkey = mw->mmw.key;
+ return 0;
+
+err_mw:
+ mlx4_mw_free(dev->dev, &mw->mmw);
+ return err;
+}
+
+int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
+{
+ struct mlx4_ib_mw *mw = to_mmw(ibmw);
+
+ mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
+ return 0;
+}
+
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mr *mr;
+ int err;
+
+ if (mr_type != IB_MR_TYPE_MEM_REG ||
+ max_num_sg > MLX4_MAX_FAST_REG_PAGES)
+ return ERR_PTR(-EINVAL);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
+ max_num_sg, 0, &mr->mmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_alloc_priv_pages(pd->device, mr, max_num_sg);
+ if (err)
+ goto err_free_mr;
+
+ mr->max_pages = max_num_sg;
+ err = mlx4_mr_enable(dev->dev, &mr->mmr);
+ if (err)
+ goto err_free_pl;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_free_pl:
+ mr->ibmr.device = pd->device;
+ mlx4_free_priv_pages(mr);
+err_free_mr:
+ (void) mlx4_mr_free(dev->dev, &mr->mmr);
+err_free:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+static int mlx4_set_page(struct ib_mr *ibmr, u64 addr)
+{
+ struct mlx4_ib_mr *mr = to_mmr(ibmr);
+
+ if (unlikely(mr->npages == mr->max_pages))
+ return -ENOMEM;
+
+ mr->pages[mr->npages++] = cpu_to_be64(addr | MLX4_MTT_FLAG_PRESENT);
+
+ return 0;
+}
+
+int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset)
+{
+ struct mlx4_ib_mr *mr = to_mmr(ibmr);
+ int rc;
+
+ mr->npages = 0;
+
+ ib_dma_sync_single_for_cpu(ibmr->device, mr->page_map,
+ mr->page_map_size, DMA_TO_DEVICE);
+
+ rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page);
+
+ ib_dma_sync_single_for_device(ibmr->device, mr->page_map,
+ mr->page_map_size, DMA_TO_DEVICE);
+
+ return rc;
+}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644
index 000000000..ac479e81d
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -0,0 +1,4474 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/log2.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_mad.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include <rdma/mlx4-abi.h>
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
+ struct mlx4_ib_cq *recv_cq);
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
+ struct mlx4_ib_cq *recv_cq);
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+ struct ib_udata *udata);
+
+enum {
+ MLX4_IB_ACK_REQ_FREQ = 8,
+};
+
+enum {
+ MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
+ MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
+ MLX4_IB_LINK_TYPE_IB = 0,
+ MLX4_IB_LINK_TYPE_ETH = 1
+};
+
+enum {
+ MLX4_IB_MIN_SQ_STRIDE = 6,
+ MLX4_IB_CACHE_LINE_SIZE = 64,
+};
+
+enum {
+ MLX4_RAW_QP_MTU = 7,
+ MLX4_RAW_QP_MSGMAX = 31,
+};
+
+#ifndef ETH_ALEN
+#define ETH_ALEN 6
+#endif
+
+static const __be32 mlx4_ib_opcode[] = {
+ [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND),
+ [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO),
+ [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+ [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+ [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+ [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+ [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+ [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+ [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+ [IB_WR_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
+ [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+ [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+};
+
+enum mlx4_ib_source_type {
+ MLX4_IB_QP_SRC = 0,
+ MLX4_IB_RWQ_SRC = 1,
+};
+
+static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+
+ return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
+ qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
+ 8 * MLX4_MFUNC_MAX;
+}
+
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ int proxy_sqp = 0;
+ int real_sqp = 0;
+ int i;
+ /* PPF or Native -- real SQP */
+ real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+ qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+ qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
+ if (real_sqp)
+ return 1;
+ /* VF or PF -- proxy SQP */
+ if (mlx4_is_mfunc(dev->dev)) {
+ for (i = 0; i < dev->dev->caps.num_ports; i++) {
+ if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy ||
+ qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp1_proxy) {
+ proxy_sqp = 1;
+ break;
+ }
+ }
+ }
+ if (proxy_sqp)
+ return 1;
+
+ return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP);
+}
+
+/* used for INIT/CLOSE port logic */
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ int proxy_qp0 = 0;
+ int real_qp0 = 0;
+ int i;
+ /* PPF or Native -- real QP0 */
+ real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+ qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+ qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
+ if (real_qp0)
+ return 1;
+ /* VF or PF -- proxy QP0 */
+ if (mlx4_is_mfunc(dev->dev)) {
+ for (i = 0; i < dev->dev->caps.num_ports; i++) {
+ if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy) {
+ proxy_qp0 = 1;
+ break;
+ }
+ }
+ }
+ return proxy_qp0;
+}
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+ return mlx4_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with 0xffffffff, except for
+ * the very first chunk of the WQE.
+ */
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ __be32 *wqe;
+ int i;
+ int s;
+ void *buf;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+
+ buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ ctrl = (struct mlx4_wqe_ctrl_seg *)buf;
+ s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
+ for (i = 64; i < s; i += 64) {
+ wqe = buf + i;
+ *wqe = cpu_to_be32(0xffffffff);
+ }
+}
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+
+ if (type == MLX4_EVENT_TYPE_PATH_MIG)
+ to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+ if (ibqp->event_handler) {
+ event.device = ibqp->device;
+ event.element.qp = ibqp;
+ switch (type) {
+ case MLX4_EVENT_TYPE_PATH_MIG:
+ event.event = IB_EVENT_PATH_MIG;
+ break;
+ case MLX4_EVENT_TYPE_COMM_EST:
+ event.event = IB_EVENT_COMM_EST;
+ break;
+ case MLX4_EVENT_TYPE_SQ_DRAINED:
+ event.event = IB_EVENT_SQ_DRAINED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+ event.event = IB_EVENT_QP_FATAL;
+ break;
+ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+ event.event = IB_EVENT_PATH_MIG_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ event.event = IB_EVENT_QP_REQ_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+ event.event = IB_EVENT_QP_ACCESS_ERR;
+ break;
+ default:
+ pr_warn("Unexpected event type %d "
+ "on QP %06x\n", type, qp->qpn);
+ return;
+ }
+
+ ibqp->event_handler(&event, ibqp->qp_context);
+ }
+}
+
+static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+ pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n",
+ type, qp->qpn);
+}
+
+static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
+{
+ /*
+ * UD WQEs must have a datagram segment.
+ * RC and UC WQEs might have a remote address segment.
+ * MLX WQEs need two extra inline data segments (for the UD
+ * header and space for the ICRC).
+ */
+ switch (type) {
+ case MLX4_IB_QPT_UD:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg) +
+ ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg) + 64;
+ case MLX4_IB_QPT_TUN_SMI_OWNER:
+ case MLX4_IB_QPT_TUN_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg);
+
+ case MLX4_IB_QPT_UC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case MLX4_IB_QPT_RC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_masked_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ ALIGN(MLX4_IB_UD_HEADER_SIZE +
+ DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+ MLX4_INLINE_ALIGN) *
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg)) +
+ ALIGN(4 +
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg));
+ default:
+ return sizeof (struct mlx4_wqe_ctrl_seg);
+ }
+}
+
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ bool is_user, bool has_rq, struct mlx4_ib_qp *qp,
+ u32 inl_recv_sz)
+{
+ /* Sanity check RQ size before proceeding */
+ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
+ cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
+ return -EINVAL;
+
+ if (!has_rq) {
+ if (cap->max_recv_wr || inl_recv_sz)
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = qp->rq.max_gs = 0;
+ } else {
+ u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg *
+ sizeof(struct mlx4_wqe_data_seg);
+ u32 wqe_size;
+
+ /* HW requires >= 1 RQ entry with >= 1 gather entry */
+ if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge ||
+ inl_recv_sz > max_inl_recv_sz))
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+ qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+ wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);
+ qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz));
+ }
+
+ /* leave userspace return values as they were, so as not to break ABI */
+ if (is_user) {
+ cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt;
+ cap->max_recv_sge = qp->rq.max_gs;
+ } else {
+ cap->max_recv_wr = qp->rq.max_post =
+ min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
+ cap->max_recv_sge = min(qp->rq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ }
+
+ return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+ int s;
+
+ /* Sanity check SQ size before proceeding */
+ if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+ cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+ cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
+ sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ /*
+ * For MLX transport we need 2 extra S/G entries:
+ * one for the header and one for the checksum at the end
+ */
+ if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
+ type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
+ cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+ return -EINVAL;
+
+ s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+ cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+ send_wqe_overhead(type, qp->flags);
+
+ if (s > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+ /*
+ * We need to leave 2 KB + 1 WR of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = MLX4_IB_SQ_HEADROOM(qp->sq.wqe_shift);
+ qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr +
+ qp->sq_spare_wqes);
+
+ qp->sq.max_gs =
+ (min(dev->dev->caps.max_sq_desc_sz,
+ (1 << qp->sq.wqe_shift)) -
+ send_wqe_overhead(type, qp->flags)) /
+ sizeof (struct mlx4_wqe_data_seg);
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ } else {
+ qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ qp->sq.offset = 0;
+ }
+
+ cap->max_send_wr = qp->sq.max_post =
+ qp->sq.wqe_cnt - qp->sq_spare_wqes;
+ cap->max_send_sge = min(qp->sq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ /* We don't support inline sends for kernel QPs (yet) */
+ cap->max_inline_data = 0;
+
+ return 0;
+}
+
+static int set_user_sq_size(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_qp *qp,
+ struct mlx4_ib_create_qp *ucmd)
+{
+ u32 cnt;
+
+ /* Sanity check SQ size before proceeding */
+ if (check_shl_overflow(1, ucmd->log_sq_bb_count, &cnt) ||
+ cnt > dev->dev->caps.max_wqes)
+ return -EINVAL;
+ if (ucmd->log_sq_stride >
+ ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
+ ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
+ return -EINVAL;
+
+ qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
+ qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+ return 0;
+}
+
+static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+ int i;
+
+ qp->sqp_proxy_rcv =
+ kmalloc_array(qp->rq.wqe_cnt, sizeof(struct mlx4_ib_buf),
+ GFP_KERNEL);
+ if (!qp->sqp_proxy_rcv)
+ return -ENOMEM;
+ for (i = 0; i < qp->rq.wqe_cnt; i++) {
+ qp->sqp_proxy_rcv[i].addr =
+ kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ GFP_KERNEL);
+ if (!qp->sqp_proxy_rcv[i].addr)
+ goto err;
+ qp->sqp_proxy_rcv[i].map =
+ ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) {
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ goto err;
+ }
+ }
+ return 0;
+
+err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ }
+ kfree(qp->sqp_proxy_rcv);
+ qp->sqp_proxy_rcv = NULL;
+ return -ENOMEM;
+}
+
+static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+ int i;
+
+ for (i = 0; i < qp->rq.wqe_cnt; i++) {
+ ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ }
+ kfree(qp->sqp_proxy_rcv);
+}
+
+static bool qp_has_rq(struct ib_qp_init_attr *attr)
+{
+ if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+ return false;
+
+ return !attr->srq;
+}
+
+static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn)
+{
+ int i;
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (qpn == dev->caps.spec_qps[i].qp0_proxy)
+ return !!dev->caps.spec_qps[i].qp0_qkey;
+ }
+ return 0;
+}
+
+static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_qp *qp)
+{
+ mutex_lock(&dev->counters_table[qp->port - 1].mutex);
+ mlx4_counter_free(dev->dev, qp->counter_index->index);
+ list_del(&qp->counter_index->list);
+ mutex_unlock(&dev->counters_table[qp->port - 1].mutex);
+
+ kfree(qp->counter_index);
+ qp->counter_index = NULL;
+}
+
+static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx,
+ struct ib_qp_init_attr *init_attr,
+ struct mlx4_ib_create_qp_rss *ucmd)
+{
+ rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num |
+ (init_attr->rwq_ind_tbl->log_ind_tbl_size << 24);
+
+ if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) &&
+ (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) {
+ memcpy(rss_ctx->rss_key, ucmd->rx_hash_key,
+ MLX4_EN_RSS_KEY_SIZE);
+ } else {
+ pr_debug("RX Hash function is not supported\n");
+ return (-EOPNOTSUPP);
+ }
+
+ if (ucmd->rx_hash_fields_mask & ~(u64)(MLX4_IB_RX_HASH_SRC_IPV4 |
+ MLX4_IB_RX_HASH_DST_IPV4 |
+ MLX4_IB_RX_HASH_SRC_IPV6 |
+ MLX4_IB_RX_HASH_DST_IPV6 |
+ MLX4_IB_RX_HASH_SRC_PORT_TCP |
+ MLX4_IB_RX_HASH_DST_PORT_TCP |
+ MLX4_IB_RX_HASH_SRC_PORT_UDP |
+ MLX4_IB_RX_HASH_DST_PORT_UDP |
+ MLX4_IB_RX_HASH_INNER)) {
+ pr_debug("RX Hash fields_mask has unsupported mask (0x%llx)\n",
+ ucmd->rx_hash_fields_mask);
+ return (-EOPNOTSUPP);
+ }
+
+ if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) &&
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
+ rss_ctx->flags = MLX4_RSS_IPV4;
+ } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) ||
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
+ pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n");
+ return (-EOPNOTSUPP);
+ }
+
+ if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) &&
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
+ rss_ctx->flags |= MLX4_RSS_IPV6;
+ } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) ||
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
+ pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n");
+ return (-EOPNOTSUPP);
+ }
+
+ if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) &&
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
+ if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) {
+ pr_debug("RX Hash fields_mask for UDP is not supported\n");
+ return (-EOPNOTSUPP);
+ }
+
+ if (rss_ctx->flags & MLX4_RSS_IPV4)
+ rss_ctx->flags |= MLX4_RSS_UDP_IPV4;
+ if (rss_ctx->flags & MLX4_RSS_IPV6)
+ rss_ctx->flags |= MLX4_RSS_UDP_IPV6;
+ if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) {
+ pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n");
+ return (-EOPNOTSUPP);
+ }
+ } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) ||
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
+ pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n");
+ return (-EOPNOTSUPP);
+ }
+
+ if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) &&
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
+ if (rss_ctx->flags & MLX4_RSS_IPV4)
+ rss_ctx->flags |= MLX4_RSS_TCP_IPV4;
+ if (rss_ctx->flags & MLX4_RSS_IPV6)
+ rss_ctx->flags |= MLX4_RSS_TCP_IPV6;
+ if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) {
+ pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n");
+ return (-EOPNOTSUPP);
+ }
+ } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) ||
+ (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
+ pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n");
+ return (-EOPNOTSUPP);
+ }
+
+ if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) {
+ if (dev->dev->caps.tunnel_offload_mode ==
+ MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+ /*
+ * Hash according to inner headers if exist, otherwise
+ * according to outer headers.
+ */
+ rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY;
+ } else {
+ pr_debug("RSS Hash for inner headers isn't supported\n");
+ return (-EOPNOTSUPP);
+ }
+ }
+
+ return 0;
+}
+
+static int create_qp_rss(struct mlx4_ib_dev *dev,
+ struct ib_qp_init_attr *init_attr,
+ struct mlx4_ib_create_qp_rss *ucmd,
+ struct mlx4_ib_qp *qp)
+{
+ int qpn;
+ int err;
+
+ qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
+
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage);
+ if (err)
+ return err;
+
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+ if (err)
+ goto err_qpn;
+
+ INIT_LIST_HEAD(&qp->gid_list);
+ INIT_LIST_HEAD(&qp->steering_rules);
+
+ qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET;
+ qp->state = IB_QPS_RESET;
+
+ /* Set dummy send resources to be compatible with HV and PRM */
+ qp->sq_no_prefetch = 1;
+ qp->sq.wqe_cnt = 1;
+ qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
+ qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE;
+ qp->mtt = (to_mqp(
+ (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt;
+
+ qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL);
+ if (!qp->rss_ctx) {
+ err = -ENOMEM;
+ goto err_qp_alloc;
+ }
+
+ err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd);
+ if (err)
+ goto err;
+
+ return 0;
+
+err:
+ kfree(qp->rss_ctx);
+
+err_qp_alloc:
+ mlx4_qp_remove(dev->dev, &qp->mqp);
+ mlx4_qp_free(dev->dev, &qp->mqp);
+
+err_qpn:
+ mlx4_qp_release_range(dev->dev, qpn, 1);
+ return err;
+}
+
+static int _mlx4_ib_create_qp_rss(struct ib_pd *pd, struct mlx4_ib_qp *qp,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_create_qp_rss ucmd = {};
+ size_t required_cmd_sz;
+ int err;
+
+ if (!udata) {
+ pr_debug("RSS QP with NULL udata\n");
+ return -EINVAL;
+ }
+
+ if (udata->outlen)
+ return -EOPNOTSUPP;
+
+ required_cmd_sz = offsetof(typeof(ucmd), reserved1) +
+ sizeof(ucmd.reserved1);
+ if (udata->inlen < required_cmd_sz) {
+ pr_debug("invalid inlen\n");
+ return -EINVAL;
+ }
+
+ if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+ pr_debug("copy failed\n");
+ return -EFAULT;
+ }
+
+ if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)))
+ return -EOPNOTSUPP;
+
+ if (ucmd.comp_mask || ucmd.reserved1)
+ return -EOPNOTSUPP;
+
+ if (udata->inlen > sizeof(ucmd) &&
+ !ib_is_udata_cleared(udata, sizeof(ucmd),
+ udata->inlen - sizeof(ucmd))) {
+ pr_debug("inlen is not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+ pr_debug("RSS QP with unsupported QP type %d\n",
+ init_attr->qp_type);
+ return -EOPNOTSUPP;
+ }
+
+ if (init_attr->create_flags) {
+ pr_debug("RSS QP doesn't support create flags\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (init_attr->send_cq || init_attr->cap.max_send_wr) {
+ pr_debug("RSS QP with unsupported send attributes\n");
+ return -EOPNOTSUPP;
+ }
+
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+
+ err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp);
+ if (err)
+ return err;
+
+ qp->ibqp.qp_num = qp->mqp.qpn;
+ return 0;
+}
+
+/*
+ * This function allocates a WQN from a range which is consecutive and aligned
+ * to its size. In case the range is full, then it creates a new range and
+ * allocates WQN from it. The new range will be used for following allocations.
+ */
+static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context,
+ struct mlx4_ib_qp *qp, int range_size, int *wqn)
+{
+ struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
+ struct mlx4_wqn_range *range;
+ int err = 0;
+
+ mutex_lock(&context->wqn_ranges_mutex);
+
+ range = list_first_entry_or_null(&context->wqn_ranges_list,
+ struct mlx4_wqn_range, list);
+
+ if (!range || (range->refcount == range->size) || range->dirty) {
+ range = kzalloc(sizeof(*range), GFP_KERNEL);
+ if (!range) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mlx4_qp_reserve_range(dev->dev, range_size,
+ range_size, &range->base_wqn, 0,
+ qp->mqp.usage);
+ if (err) {
+ kfree(range);
+ goto out;
+ }
+
+ range->size = range_size;
+ list_add(&range->list, &context->wqn_ranges_list);
+ } else if (range_size != 1) {
+ /*
+ * Requesting a new range (>1) when last range is still open, is
+ * not valid.
+ */
+ err = -EINVAL;
+ goto out;
+ }
+
+ qp->wqn_range = range;
+
+ *wqn = range->base_wqn + range->refcount;
+
+ range->refcount++;
+
+out:
+ mutex_unlock(&context->wqn_ranges_mutex);
+
+ return err;
+}
+
+static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context,
+ struct mlx4_ib_qp *qp, bool dirty_release)
+{
+ struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
+ struct mlx4_wqn_range *range;
+
+ mutex_lock(&context->wqn_ranges_mutex);
+
+ range = qp->wqn_range;
+
+ range->refcount--;
+ if (!range->refcount) {
+ mlx4_qp_release_range(dev->dev, range->base_wqn,
+ range->size);
+ list_del(&range->list);
+ kfree(range);
+ } else if (dirty_release) {
+ /*
+ * A range which one of its WQNs is destroyed, won't be able to be
+ * reused for further WQN allocations.
+ * The next created WQ will allocate a new range.
+ */
+ range->dirty = true;
+ }
+
+ mutex_unlock(&context->wqn_ranges_mutex);
+}
+
+static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, struct mlx4_ib_qp *qp)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ int qpn;
+ int err;
+ struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
+ struct mlx4_ib_cq *mcq;
+ unsigned long flags;
+ int range_size;
+ struct mlx4_ib_create_wq wq;
+ size_t copy_len;
+ int shift;
+ int n;
+
+ qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET;
+
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+ INIT_LIST_HEAD(&qp->gid_list);
+ INIT_LIST_HEAD(&qp->steering_rules);
+
+ qp->state = IB_QPS_RESET;
+
+ copy_len = min(sizeof(struct mlx4_ib_create_wq), udata->inlen);
+
+ if (ib_copy_from_udata(&wq, udata, copy_len)) {
+ err = -EFAULT;
+ goto err;
+ }
+
+ if (wq.comp_mask || wq.reserved[0] || wq.reserved[1] ||
+ wq.reserved[2]) {
+ pr_debug("user command isn't supported\n");
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ if (wq.log_range_size > ilog2(dev->dev->caps.max_rss_tbl_sz)) {
+ pr_debug("WQN range size must be equal or smaller than %d\n",
+ dev->dev->caps.max_rss_tbl_sz);
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+ range_size = 1 << wq.log_range_size;
+
+ if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS)
+ qp->flags |= MLX4_IB_QP_SCATTER_FCS;
+
+ err = set_rq_size(dev, &init_attr->cap, true, true, qp, qp->inl_recv_sz);
+ if (err)
+ goto err;
+
+ qp->sq_no_prefetch = 1;
+ qp->sq.wqe_cnt = 1;
+ qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+ qp->umem = ib_umem_get(pd->device, wq.buf_addr, qp->buf_size, 0);
+ if (IS_ERR(qp->umem)) {
+ err = PTR_ERR(qp->umem);
+ goto err;
+ }
+
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
+ err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
+
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+ if (err)
+ goto err_mtt;
+
+ err = mlx4_ib_db_map_user(udata, wq.db_addr, &qp->db);
+ if (err)
+ goto err_mtt;
+ qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
+
+ err = mlx4_ib_alloc_wqn(context, qp, range_size, &qpn);
+ if (err)
+ goto err_wrid;
+
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+ if (err)
+ goto err_qpn;
+
+ /*
+ * Hardware wants QPN written in big-endian order (after
+ * shifting) for send doorbell. Precompute this value to save
+ * a little bit when posting sends.
+ */
+ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+ qp->mqp.event = mlx4_ib_wq_event;
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ /* Maintain device to QPs access, needed for further handling
+ * via reset flow
+ */
+ list_add_tail(&qp->qps_list, &dev->qp_list);
+ /* Maintain CQ to QPs access, needed for further handling
+ * via reset flow
+ */
+ mcq = to_mcq(init_attr->send_cq);
+ list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
+ mcq = to_mcq(init_attr->recv_cq);
+ list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
+ mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+ return 0;
+
+err_qpn:
+ mlx4_ib_release_wqn(context, qp, 0);
+err_wrid:
+ mlx4_ib_db_unmap_user(context, &qp->db);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+err_buf:
+ ib_umem_release(qp->umem);
+err:
+ return err;
+}
+
+static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int sqpn,
+ struct mlx4_ib_qp *qp)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ int qpn;
+ int err;
+ struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
+ enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
+ struct mlx4_ib_cq *mcq;
+ unsigned long flags;
+
+ /* When tunneling special qps, we use a plain UD qp */
+ if (sqpn) {
+ if (mlx4_is_mfunc(dev->dev) &&
+ (!mlx4_is_master(dev->dev) ||
+ !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
+ if (init_attr->qp_type == IB_QPT_GSI)
+ qp_type = MLX4_IB_QPT_PROXY_GSI;
+ else {
+ if (mlx4_is_master(dev->dev) ||
+ qp0_enabled_vf(dev->dev, sqpn))
+ qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
+ else
+ qp_type = MLX4_IB_QPT_PROXY_SMI;
+ }
+ }
+ qpn = sqpn;
+ /* add extra sg entry for tunneling */
+ init_attr->cap.max_recv_sge++;
+ } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
+ struct mlx4_ib_qp_tunnel_init_attr *tnl_init =
+ container_of(init_attr,
+ struct mlx4_ib_qp_tunnel_init_attr, init_attr);
+ if ((tnl_init->proxy_qp_type != IB_QPT_SMI &&
+ tnl_init->proxy_qp_type != IB_QPT_GSI) ||
+ !mlx4_is_master(dev->dev))
+ return -EINVAL;
+ if (tnl_init->proxy_qp_type == IB_QPT_GSI)
+ qp_type = MLX4_IB_QPT_TUN_GSI;
+ else if (tnl_init->slave == mlx4_master_func_num(dev->dev) ||
+ mlx4_vf_smi_enabled(dev->dev, tnl_init->slave,
+ tnl_init->port))
+ qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
+ else
+ qp_type = MLX4_IB_QPT_TUN_SMI;
+ /* we are definitely in the PPF here, since we are creating
+ * tunnel QPs. base_tunnel_sqpn is therefore valid. */
+ qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
+ + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
+ sqpn = qpn;
+ }
+
+ if (init_attr->qp_type == IB_QPT_SMI ||
+ init_attr->qp_type == IB_QPT_GSI || qp_type == MLX4_IB_QPT_SMI ||
+ qp_type == MLX4_IB_QPT_GSI ||
+ (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+ qp->sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL);
+ if (!qp->sqp)
+ return -ENOMEM;
+ }
+
+ qp->mlx4_ib_qp_type = qp_type;
+
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+ INIT_LIST_HEAD(&qp->gid_list);
+ INIT_LIST_HEAD(&qp->steering_rules);
+
+ qp->state = IB_QPS_RESET;
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ if (udata) {
+ struct mlx4_ib_create_qp ucmd;
+ size_t copy_len;
+ int shift;
+ int n;
+
+ copy_len = sizeof(struct mlx4_ib_create_qp);
+
+ if (ib_copy_from_udata(&ucmd, udata, copy_len)) {
+ err = -EFAULT;
+ goto err;
+ }
+
+ qp->inl_recv_sz = ucmd.inl_recv_sz;
+
+ if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
+ if (!(dev->dev->caps.flags &
+ MLX4_DEV_CAP_FLAG_FCS_KEEP)) {
+ pr_debug("scatter FCS is unsupported\n");
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ qp->flags |= MLX4_IB_QP_SCATTER_FCS;
+ }
+
+ err = set_rq_size(dev, &init_attr->cap, udata,
+ qp_has_rq(init_attr), qp, qp->inl_recv_sz);
+ if (err)
+ goto err;
+
+ qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+
+ err = set_user_sq_size(dev, qp, &ucmd);
+ if (err)
+ goto err;
+
+ qp->umem =
+ ib_umem_get(pd->device, ucmd.buf_addr, qp->buf_size, 0);
+ if (IS_ERR(qp->umem)) {
+ err = PTR_ERR(qp->umem);
+ goto err;
+ }
+
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
+ err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
+
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+ if (err)
+ goto err_mtt;
+
+ if (qp_has_rq(init_attr)) {
+ err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &qp->db);
+ if (err)
+ goto err_mtt;
+ }
+ qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
+ } else {
+ err = set_rq_size(dev, &init_attr->cap, udata,
+ qp_has_rq(init_attr), qp, 0);
+ if (err)
+ goto err;
+
+ qp->sq_no_prefetch = 0;
+
+ if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+ qp->flags |= MLX4_IB_QP_LSO;
+
+ if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+ if (dev->steering_support ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED)
+ qp->flags |= MLX4_IB_QP_NETIF;
+ else {
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
+ err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+ if (err)
+ goto err;
+
+ if (qp_has_rq(init_attr)) {
+ err = mlx4_db_alloc(dev->dev, &qp->db, 0);
+ if (err)
+ goto err;
+
+ *qp->db.db = 0;
+ }
+
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2,
+ &qp->buf)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+ &qp->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
+ if (err)
+ goto err_mtt;
+
+ qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt,
+ sizeof(u64), GFP_KERNEL);
+ qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt,
+ sizeof(u64), GFP_KERNEL);
+ if (!qp->sq.wrid || !qp->rq.wrid) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ qp->mqp.usage = MLX4_RES_USAGE_DRIVER;
+ }
+
+ if (sqpn) {
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ if (alloc_proxy_bufs(pd->device, qp)) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ }
+ } else {
+ /* Raw packet QPNs may not have bits 6,7 set in their qp_num;
+ * otherwise, the WQE BlueFlame setup flow wrongly causes
+ * VLAN insertion. */
+ if (init_attr->qp_type == IB_QPT_RAW_PACKET)
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
+ (init_attr->cap.max_send_wr ?
+ MLX4_RESERVE_ETH_BF_QP : 0) |
+ (init_attr->cap.max_recv_wr ?
+ MLX4_RESERVE_A0_QP : 0),
+ qp->mqp.usage);
+ else
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
+ else
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1,
+ &qpn, 0, qp->mqp.usage);
+ if (err)
+ goto err_proxy;
+ }
+
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+ if (err)
+ goto err_qpn;
+
+ if (init_attr->qp_type == IB_QPT_XRC_TGT)
+ qp->mqp.qpn |= (1 << 23);
+
+ /*
+ * Hardware wants QPN written in big-endian order (after
+ * shifting) for send doorbell. Precompute this value to save
+ * a little bit when posting sends.
+ */
+ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+ qp->mqp.event = mlx4_ib_qp_event;
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ /* Maintain device to QPs access, needed for further handling
+ * via reset flow
+ */
+ list_add_tail(&qp->qps_list, &dev->qp_list);
+ /* Maintain CQ to QPs access, needed for further handling
+ * via reset flow
+ */
+ mcq = to_mcq(init_attr->send_cq);
+ list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
+ mcq = to_mcq(init_attr->recv_cq);
+ list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
+ mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+ return 0;
+
+err_qpn:
+ if (!sqpn) {
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ mlx4_ib_steer_qp_free(dev, qpn, 1);
+ else
+ mlx4_qp_release_range(dev->dev, qpn, 1);
+ }
+err_proxy:
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+ free_proxy_bufs(pd->device, qp);
+err_wrid:
+ if (udata) {
+ if (qp_has_rq(init_attr))
+ mlx4_ib_db_unmap_user(context, &qp->db);
+ } else {
+ kvfree(qp->sq.wrid);
+ kvfree(qp->rq.wrid);
+ }
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+ if (!qp->umem)
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+ ib_umem_release(qp->umem);
+
+err_db:
+ if (!udata && qp_has_rq(init_attr))
+ mlx4_db_free(dev->dev, &qp->db);
+
+err:
+ kfree(qp->sqp);
+ return err;
+}
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+ switch (state) {
+ case IB_QPS_RESET: return MLX4_QP_STATE_RST;
+ case IB_QPS_INIT: return MLX4_QP_STATE_INIT;
+ case IB_QPS_RTR: return MLX4_QP_STATE_RTR;
+ case IB_QPS_RTS: return MLX4_QP_STATE_RTS;
+ case IB_QPS_SQD: return MLX4_QP_STATE_SQD;
+ case IB_QPS_SQE: return MLX4_QP_STATE_SQER;
+ case IB_QPS_ERR: return MLX4_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+ __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+ if (send_cq == recv_cq) {
+ spin_lock(&send_cq->lock);
+ __acquire(&recv_cq->lock);
+ } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_lock(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+ __releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+ if (send_cq == recv_cq) {
+ __release(&recv_cq->lock);
+ spin_unlock(&send_cq->lock);
+ } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock(&recv_cq->lock);
+ }
+}
+
+static void del_gid_entries(struct mlx4_ib_qp *qp)
+{
+ struct mlx4_ib_gid_entry *ge, *tmp;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ list_del(&ge->list);
+ kfree(ge);
+ }
+}
+
+static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp)
+{
+ if (qp->ibqp.qp_type == IB_QPT_XRC_TGT)
+ return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd);
+ else
+ return to_mpd(qp->ibqp.pd);
+}
+
+static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src,
+ struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
+{
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_XRC_TGT:
+ *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq);
+ *recv_cq = *send_cq;
+ break;
+ case IB_QPT_XRC_INI:
+ *send_cq = to_mcq(qp->ibqp.send_cq);
+ *recv_cq = *send_cq;
+ break;
+ default:
+ *recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) :
+ to_mcq(qp->ibwq.cq);
+ *send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) :
+ *recv_cq;
+ break;
+ }
+}
+
+static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ if (qp->state != IB_QPS_RESET) {
+ int i;
+
+ for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size);
+ i++) {
+ struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i];
+ struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
+
+ mutex_lock(&wq->mutex);
+
+ wq->rss_usecnt--;
+
+ mutex_unlock(&wq->mutex);
+ }
+
+ if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+ MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+ pr_warn("modify QP %06x to RESET failed.\n",
+ qp->mqp.qpn);
+ }
+
+ mlx4_qp_remove(dev->dev, &qp->mqp);
+ mlx4_qp_free(dev->dev, &qp->mqp);
+ mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+ del_gid_entries(qp);
+}
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+ enum mlx4_ib_source_type src,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_cq *send_cq, *recv_cq;
+ unsigned long flags;
+
+ if (qp->state != IB_QPS_RESET) {
+ if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+ MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+ pr_warn("modify QP %06x to RESET failed.\n",
+ qp->mqp.qpn);
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
+ mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+ qp->pri.smac = 0;
+ qp->pri.smac_port = 0;
+ }
+ if (qp->alt.smac) {
+ mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+ qp->alt.smac = 0;
+ }
+ if (qp->pri.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+ qp->pri.vid = 0xFFFF;
+ qp->pri.candidate_vid = 0xFFFF;
+ qp->pri.update_vid = 0;
+ }
+ if (qp->alt.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+ qp->alt.vid = 0xFFFF;
+ qp->alt.candidate_vid = 0xFFFF;
+ qp->alt.update_vid = 0;
+ }
+ }
+
+ get_cqs(qp, src, &send_cq, &recv_cq);
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+ /* del from lists under both locks above to protect reset flow paths */
+ list_del(&qp->qps_list);
+ list_del(&qp->cq_send_list);
+ list_del(&qp->cq_recv_list);
+ if (!udata) {
+ __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
+ if (send_cq != recv_cq)
+ __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+ }
+
+ mlx4_qp_remove(dev->dev, &qp->mqp);
+
+ mlx4_ib_unlock_cqs(send_cq, recv_cq);
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
+ mlx4_qp_free(dev->dev, &qp->mqp);
+
+ if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) {
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
+ else if (src == MLX4_IB_RWQ_SRC)
+ mlx4_ib_release_wqn(
+ rdma_udata_to_drv_context(
+ udata,
+ struct mlx4_ib_ucontext,
+ ibucontext),
+ qp, 1);
+ else
+ mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+ }
+
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+ if (udata) {
+ if (qp->rq.wqe_cnt) {
+ struct mlx4_ib_ucontext *mcontext =
+ rdma_udata_to_drv_context(
+ udata,
+ struct mlx4_ib_ucontext,
+ ibucontext);
+
+ mlx4_ib_db_unmap_user(mcontext, &qp->db);
+ }
+ } else {
+ kvfree(qp->sq.wrid);
+ kvfree(qp->rq.wrid);
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+ free_proxy_bufs(&dev->ib_dev, qp);
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+ if (qp->rq.wqe_cnt)
+ mlx4_db_free(dev->dev, &qp->db);
+ }
+ ib_umem_release(qp->umem);
+
+ del_gid_entries(qp);
+}
+
+static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
+{
+ /* Native or PPF */
+ if (!mlx4_is_mfunc(dev->dev) ||
+ (mlx4_is_master(dev->dev) &&
+ attr->create_flags & MLX4_IB_SRIOV_SQP)) {
+ return dev->dev->phys_caps.base_sqpn +
+ (attr->qp_type == IB_QPT_SMI ? 0 : 2) +
+ attr->port_num - 1;
+ }
+ /* PF or VF -- creating proxies */
+ if (attr->qp_type == IB_QPT_SMI)
+ return dev->dev->caps.spec_qps[attr->port_num - 1].qp0_proxy;
+ else
+ return dev->dev->caps.spec_qps[attr->port_num - 1].qp1_proxy;
+}
+
+static int _mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ int err;
+ int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+ u16 xrcdn = 0;
+
+ if (init_attr->rwq_ind_tbl)
+ return _mlx4_ib_create_qp_rss(pd, qp, init_attr, udata);
+
+ /*
+ * We only support LSO, vendor flag1, and multicast loopback blocking,
+ * and only for kernel UD QPs.
+ */
+ if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
+ MLX4_IB_SRIOV_TUNNEL_QP |
+ MLX4_IB_SRIOV_SQP |
+ MLX4_IB_QP_NETIF |
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI))
+ return -EOPNOTSUPP;
+
+ if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+ if (init_attr->qp_type != IB_QPT_UD)
+ return -EINVAL;
+ }
+
+ if (init_attr->create_flags) {
+ if (udata && init_attr->create_flags & ~(sup_u_create_flags))
+ return -EINVAL;
+
+ if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI |
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) &&
+ init_attr->qp_type != IB_QPT_UD) ||
+ (init_attr->create_flags & MLX4_IB_SRIOV_SQP &&
+ init_attr->qp_type > IB_QPT_GSI) ||
+ (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
+ init_attr->qp_type != IB_QPT_GSI))
+ return -EINVAL;
+ }
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_XRC_TGT:
+ pd = to_mxrcd(init_attr->xrcd)->pd;
+ xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
+ init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;
+ fallthrough;
+ case IB_QPT_XRC_INI:
+ if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return -ENOSYS;
+ init_attr->recv_cq = init_attr->send_cq;
+ fallthrough;
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_RAW_PACKET:
+ case IB_QPT_UD:
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+ err = create_qp_common(pd, init_attr, udata, 0, qp);
+ if (err)
+ return err;
+
+ qp->ibqp.qp_num = qp->mqp.qpn;
+ qp->xrcdn = xrcdn;
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
+ int sqpn;
+
+ if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
+ int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev,
+ 1, 1, &sqpn, 0,
+ MLX4_RES_USAGE_DRIVER);
+
+ if (res)
+ return res;
+ } else {
+ sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
+ }
+
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+ err = create_qp_common(pd, init_attr, udata, sqpn, qp);
+ if (err)
+ return err;
+
+ if (init_attr->create_flags &
+ (MLX4_IB_SRIOV_SQP | MLX4_IB_SRIOV_TUNNEL_QP))
+ /* Internal QP created with ib_create_qp */
+ rdma_restrack_no_track(&qp->ibqp.res);
+
+ qp->port = init_attr->port_num;
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
+ init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1;
+ break;
+ }
+ default:
+ /* Don't support raw QPs */
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+int mlx4_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct ib_device *device = ibqp->device;
+ struct mlx4_ib_dev *dev = to_mdev(device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct ib_pd *pd = ibqp->pd;
+ int ret;
+
+ mutex_init(&qp->mutex);
+ ret = _mlx4_ib_create_qp(pd, qp, init_attr, udata);
+ if (ret)
+ return ret;
+
+ if (init_attr->qp_type == IB_QPT_GSI &&
+ !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
+ struct mlx4_ib_sqp *sqp = qp->sqp;
+ int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num);
+
+ if (is_eth &&
+ dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+ init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+ sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
+
+ if (IS_ERR(sqp->roce_v2_gsi)) {
+ pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
+ sqp->roce_v2_gsi = NULL;
+ } else {
+ to_mqp(sqp->roce_v2_gsi)->flags |=
+ MLX4_IB_ROCE_V2_GSI_QP;
+ }
+
+ init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+ }
+ }
+ return 0;
+}
+
+static int _mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(qp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+ if (is_qp0(dev, mqp))
+ mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+ if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI &&
+ dev->qp1_proxy[mqp->port - 1] == mqp) {
+ mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]);
+ dev->qp1_proxy[mqp->port - 1] = NULL;
+ mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]);
+ }
+
+ if (mqp->counter_index)
+ mlx4_ib_free_qp_counter(dev, mqp);
+
+ if (qp->rwq_ind_tbl) {
+ destroy_qp_rss(dev, mqp);
+ } else {
+ destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, udata);
+ }
+
+ kfree(mqp->sqp);
+ return 0;
+}
+
+int mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+ if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = mqp->sqp;
+
+ if (sqp->roce_v2_gsi)
+ ib_destroy_qp(sqp->roce_v2_gsi);
+ }
+
+ return _mlx4_ib_destroy_qp(qp, udata);
+}
+
+static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
+{
+ switch (type) {
+ case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC;
+ case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC;
+ case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD;
+ case MLX4_IB_QPT_XRC_INI:
+ case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC;
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
+ case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX;
+
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ?
+ MLX4_QP_ST_MLX : -1);
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_TUN_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ?
+ MLX4_QP_ST_UD : -1);
+ default: return -1;
+ }
+}
+
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ u8 dest_rd_atomic;
+ u32 access_flags;
+ u32 hw_access_flags = 0;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ dest_rd_atomic = attr->max_dest_rd_atomic;
+ else
+ dest_rd_atomic = qp->resp_depth;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ access_flags = attr->qp_access_flags;
+ else
+ access_flags = qp->atomic_rd_en;
+
+ if (!dest_rd_atomic)
+ access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+ if (access_flags & IB_ACCESS_REMOTE_READ)
+ hw_access_flags |= MLX4_QP_BIT_RRE;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ hw_access_flags |= MLX4_QP_BIT_RAE;
+ if (access_flags & IB_ACCESS_REMOTE_WRITE)
+ hw_access_flags |= MLX4_QP_BIT_RWE;
+
+ return cpu_to_be32(hw_access_flags);
+}
+
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ sqp->pkey_index = attr->pkey_index;
+ if (attr_mask & IB_QP_QKEY)
+ sqp->qkey = attr->qkey;
+ if (attr_mask & IB_QP_SQ_PSN)
+ sqp->send_psn = attr->sq_psn;
+}
+
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+ path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
+}
+
+static int _mlx4_set_path(struct mlx4_ib_dev *dev,
+ const struct rdma_ah_attr *ah,
+ u64 smac, u16 vlan_tag, struct mlx4_qp_path *path,
+ struct mlx4_roce_smac_vlan_info *smac_info, u8 port)
+{
+ int vidx;
+ int smac_index;
+ int err;
+
+ path->grh_mylmc = rdma_ah_get_path_bits(ah) & 0x7f;
+ path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah));
+ if (rdma_ah_get_static_rate(ah)) {
+ path->static_rate = rdma_ah_get_static_rate(ah) +
+ MLX4_STAT_RATE_OFFSET;
+ while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+ --path->static_rate;
+ } else
+ path->static_rate = 0;
+
+ if (rdma_ah_get_ah_flags(ah) & IB_AH_GRH) {
+ const struct ib_global_route *grh = rdma_ah_read_grh(ah);
+ int real_sgid_index =
+ mlx4_ib_gid_index_to_real_index(dev, grh->sgid_attr);
+
+ if (real_sgid_index < 0)
+ return real_sgid_index;
+ if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
+ pr_err("sgid_index (%u) too large. max is %d\n",
+ real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+ return -1;
+ }
+
+ path->grh_mylmc |= 1 << 7;
+ path->mgid_index = real_sgid_index;
+ path->hop_limit = grh->hop_limit;
+ path->tclass_flowlabel =
+ cpu_to_be32((grh->traffic_class << 20) |
+ (grh->flow_label));
+ memcpy(path->rgid, grh->dgid.raw, 16);
+ }
+
+ if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) {
+ if (!(rdma_ah_get_ah_flags(ah) & IB_AH_GRH))
+ return -1;
+
+ path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+ ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 7) << 3);
+
+ path->feup |= MLX4_FEUP_FORCE_ETH_UP;
+ if (vlan_tag < 0x1000) {
+ if (smac_info->vid < 0x1000) {
+ /* both valid vlan ids */
+ if (smac_info->vid != vlan_tag) {
+ /* different VIDs. unreg old and reg new */
+ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+ if (err)
+ return err;
+ smac_info->candidate_vid = vlan_tag;
+ smac_info->candidate_vlan_index = vidx;
+ smac_info->candidate_vlan_port = port;
+ smac_info->update_vid = 1;
+ path->vlan_index = vidx;
+ } else {
+ path->vlan_index = smac_info->vlan_index;
+ }
+ } else {
+ /* no current vlan tag in qp */
+ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+ if (err)
+ return err;
+ smac_info->candidate_vid = vlan_tag;
+ smac_info->candidate_vlan_index = vidx;
+ smac_info->candidate_vlan_port = port;
+ smac_info->update_vid = 1;
+ path->vlan_index = vidx;
+ }
+ path->feup |= MLX4_FVL_FORCE_ETH_VLAN;
+ path->fl = 1 << 6;
+ } else {
+ /* have current vlan tag. unregister it at modify-qp success */
+ if (smac_info->vid < 0x1000) {
+ smac_info->candidate_vid = 0xFFFF;
+ smac_info->update_vid = 1;
+ }
+ }
+
+ /* get smac_index for RoCE use.
+ * If no smac was yet assigned, register one.
+ * If one was already assigned, but the new mac differs,
+ * unregister the old one and register the new one.
+ */
+ if ((!smac_info->smac && !smac_info->smac_port) ||
+ smac_info->smac != smac) {
+ /* register candidate now, unreg if needed, after success */
+ smac_index = mlx4_register_mac(dev->dev, port, smac);
+ if (smac_index >= 0) {
+ smac_info->candidate_smac_index = smac_index;
+ smac_info->candidate_smac = smac;
+ smac_info->candidate_smac_port = port;
+ } else {
+ return -EINVAL;
+ }
+ } else {
+ smac_index = smac_info->smac_index;
+ }
+ memcpy(path->dmac, ah->roce.dmac, 6);
+ path->ackto = MLX4_IB_LINK_TYPE_ETH;
+ /* put MAC table smac index for IBoE */
+ path->grh_mylmc = (u8) (smac_index) | 0x80;
+ } else {
+ path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+ ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 0xf) << 2);
+ }
+
+ return 0;
+}
+
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp,
+ enum ib_qp_attr_mask qp_attr_mask,
+ struct mlx4_ib_qp *mqp,
+ struct mlx4_qp_path *path, u8 port,
+ u16 vlan_id, u8 *smac)
+{
+ return _mlx4_set_path(dev, &qp->ah_attr,
+ ether_addr_to_u64(smac),
+ vlan_id,
+ path, &mqp->pri, port);
+}
+
+static int mlx4_set_alt_path(struct mlx4_ib_dev *dev,
+ const struct ib_qp_attr *qp,
+ enum ib_qp_attr_mask qp_attr_mask,
+ struct mlx4_ib_qp *mqp,
+ struct mlx4_qp_path *path, u8 port)
+{
+ return _mlx4_set_path(dev, &qp->alt_ah_attr,
+ 0,
+ 0xffff,
+ path, &mqp->alt, port);
+}
+
+static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ struct mlx4_ib_gid_entry *ge, *tmp;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
+ ge->added = 1;
+ ge->port = qp->port;
+ }
+ }
+}
+
+static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_qp *qp,
+ struct mlx4_qp_context *context)
+{
+ u64 u64_mac;
+ int smac_index;
+
+ u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
+
+ context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
+ if (!qp->pri.smac && !qp->pri.smac_port) {
+ smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
+ if (smac_index >= 0) {
+ qp->pri.candidate_smac_index = smac_index;
+ qp->pri.candidate_smac = u64_mac;
+ qp->pri.candidate_smac_port = qp->port;
+ context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
+ } else {
+ return -ENOENT;
+ }
+ }
+ return 0;
+}
+
+static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ struct counter_index *new_counter_index;
+ int err;
+ u32 tmp_idx;
+
+ if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) !=
+ IB_LINK_LAYER_ETHERNET ||
+ !(qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) ||
+ !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK))
+ return 0;
+
+ err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER);
+ if (err)
+ return err;
+
+ new_counter_index = kmalloc(sizeof(*new_counter_index), GFP_KERNEL);
+ if (!new_counter_index) {
+ mlx4_counter_free(dev->dev, tmp_idx);
+ return -ENOMEM;
+ }
+
+ new_counter_index->index = tmp_idx;
+ new_counter_index->allocated = 1;
+ qp->counter_index = new_counter_index;
+
+ mutex_lock(&dev->counters_table[qp->port - 1].mutex);
+ list_add_tail(&new_counter_index->list,
+ &dev->counters_table[qp->port - 1].counters_list);
+ mutex_unlock(&dev->counters_table[qp->port - 1].mutex);
+
+ return 0;
+}
+
+enum {
+ MLX4_QPC_ROCE_MODE_1 = 0,
+ MLX4_QPC_ROCE_MODE_2 = 2,
+ MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff
+};
+
+static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
+{
+ switch (gid_type) {
+ case IB_GID_TYPE_ROCE:
+ return MLX4_QPC_ROCE_MODE_1;
+ case IB_GID_TYPE_ROCE_UDP_ENCAP:
+ return MLX4_QPC_ROCE_MODE_2;
+ default:
+ return MLX4_QPC_ROCE_MODE_UNDEFINED;
+ }
+}
+
+/*
+ * Go over all RSS QP's childes (WQs) and apply their HW state according to
+ * their logic state if the RSS QP is the first RSS QP associated for the WQ.
+ */
+static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num,
+ struct ib_udata *udata)
+{
+ int err = 0;
+ int i;
+
+ for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
+ struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
+ struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
+
+ mutex_lock(&wq->mutex);
+
+ /* Mlx4_ib restrictions:
+ * WQ's is associated to a port according to the RSS QP it is
+ * associates to.
+ * In case the WQ is associated to a different port by another
+ * RSS QP, return a failure.
+ */
+ if ((wq->rss_usecnt > 0) && (wq->port != port_num)) {
+ err = -EINVAL;
+ mutex_unlock(&wq->mutex);
+ break;
+ }
+ wq->port = port_num;
+ if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) {
+ err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY, udata);
+ if (err) {
+ mutex_unlock(&wq->mutex);
+ break;
+ }
+ }
+ wq->rss_usecnt++;
+
+ mutex_unlock(&wq->mutex);
+ }
+
+ if (i && err) {
+ int j;
+
+ for (j = (i - 1); j >= 0; j--) {
+ struct ib_wq *ibwq = ind_tbl->ind_tbl[j];
+ struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
+
+ mutex_lock(&wq->mutex);
+
+ if ((wq->rss_usecnt == 1) &&
+ (ibwq->state == IB_WQS_RDY))
+ if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET,
+ udata))
+ pr_warn("failed to reverse WQN=0x%06x\n",
+ ibwq->wq_num);
+ wq->rss_usecnt--;
+
+ mutex_unlock(&wq->mutex);
+ }
+ }
+
+ return err;
+}
+
+static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl,
+ struct ib_udata *udata)
+{
+ int i;
+
+ for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
+ struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
+ struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
+
+ mutex_lock(&wq->mutex);
+
+ if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY))
+ if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, udata))
+ pr_warn("failed to reverse WQN=%x\n",
+ ibwq->wq_num);
+ wq->rss_usecnt--;
+
+ mutex_unlock(&wq->mutex);
+ }
+}
+
+static void fill_qp_rss_context(struct mlx4_qp_context *context,
+ struct mlx4_ib_qp *qp)
+{
+ struct mlx4_rss_context *rss_context;
+
+ rss_context = (void *)context + offsetof(struct mlx4_qp_context,
+ pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
+
+ rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz);
+ rss_context->default_qpn =
+ cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff);
+ if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6))
+ rss_context->base_qpn_udp = rss_context->default_qpn;
+ rss_context->flags = qp->rss_ctx->flags;
+ /* Currently support just toeplitz */
+ rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+
+ memcpy(rss_context->rss_key, qp->rss_ctx->rss_key,
+ MLX4_EN_RSS_KEY_SIZE);
+}
+
+static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state,
+ enum ib_qp_state new_state,
+ struct ib_udata *udata)
+{
+ struct ib_srq *ibsrq;
+ const struct ib_gid_attr *gid_attr = NULL;
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ enum ib_qp_type qp_type;
+ struct mlx4_ib_dev *dev;
+ struct mlx4_ib_qp *qp;
+ struct mlx4_ib_pd *pd;
+ struct mlx4_ib_cq *send_cq, *recv_cq;
+ struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
+ struct mlx4_qp_context *context;
+ enum mlx4_qp_optpar optpar = 0;
+ int sqd_event;
+ int steer_qp = 0;
+ int err = -EINVAL;
+ int counter_index;
+
+ if (src_type == MLX4_IB_RWQ_SRC) {
+ struct ib_wq *ibwq;
+
+ ibwq = (struct ib_wq *)src;
+ ibsrq = NULL;
+ rwq_ind_tbl = NULL;
+ qp_type = IB_QPT_RAW_PACKET;
+ qp = to_mqp((struct ib_qp *)ibwq);
+ dev = to_mdev(ibwq->device);
+ pd = to_mpd(ibwq->pd);
+ } else {
+ struct ib_qp *ibqp;
+
+ ibqp = (struct ib_qp *)src;
+ ibsrq = ibqp->srq;
+ rwq_ind_tbl = ibqp->rwq_ind_tbl;
+ qp_type = ibqp->qp_type;
+ qp = to_mqp(ibqp);
+ dev = to_mdev(ibqp->device);
+ pd = get_pd(qp);
+ }
+
+ /* APM is not supported under RoCE */
+ if (attr_mask & IB_QP_ALT_PATH &&
+ rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET)
+ return -ENOTSUPP;
+
+ context = kzalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return -ENOMEM;
+
+ context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+ (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
+
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ else {
+ optpar |= MLX4_QP_OPTPAR_PM_STATE;
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+
+ if (qp->inl_recv_sz)
+ context->param3 |= cpu_to_be32(1 << 25);
+
+ if (qp->flags & MLX4_IB_QP_SCATTER_FCS)
+ context->param3 |= cpu_to_be32(1 << 29);
+
+ if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI)
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+ else if (qp_type == IB_QPT_RAW_PACKET)
+ context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
+ else if (qp_type == IB_QPT_UD) {
+ if (qp->flags & MLX4_IB_QP_LSO)
+ context->mtu_msgmax = (IB_MTU_4096 << 5) |
+ ilog2(dev->dev->caps.max_gso_sz);
+ else
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 13;
+ } else if (attr_mask & IB_QP_PATH_MTU) {
+ if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+ pr_err("path MTU (%u) is invalid\n",
+ attr->path_mtu);
+ goto out;
+ }
+ context->mtu_msgmax = (attr->path_mtu << 5) |
+ ilog2(dev->dev->caps.max_msg_sz);
+ }
+
+ if (!rwq_ind_tbl) { /* PRM RSS receive side should be left zeros */
+ if (qp->rq.wqe_cnt)
+ context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
+ context->rq_size_stride |= qp->rq.wqe_shift - 4;
+ }
+
+ if (qp->sq.wqe_cnt)
+ context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
+ context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+ if (new_state == IB_QPS_RESET && qp->counter_index)
+ mlx4_ib_free_qp_counter(dev, qp);
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
+ context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+ if (qp_type == IB_QPT_RAW_PACKET)
+ context->param3 |= cpu_to_be32(1 << 30);
+ }
+
+ if (ucontext)
+ context->usr_page = cpu_to_be32(
+ mlx4_to_hw_uar_index(dev->dev, ucontext->uar.index));
+ else
+ context->usr_page = cpu_to_be32(
+ mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+ if (attr_mask & IB_QP_PORT) {
+ if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+ !(attr_mask & IB_QP_AV)) {
+ mlx4_set_sched(&context->pri_path, attr->port_num);
+ optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+ }
+ }
+
+ if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+ err = create_qp_lb_counter(dev, qp);
+ if (err)
+ goto out;
+
+ counter_index =
+ dev->counters_table[qp->port - 1].default_counter;
+ if (qp->counter_index)
+ counter_index = qp->counter_index->index;
+
+ if (counter_index != -1) {
+ context->pri_path.counter_index = counter_index;
+ optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+ if (qp->counter_index) {
+ context->pri_path.fl |=
+ MLX4_FL_ETH_SRC_CHECK_MC_LB;
+ context->pri_path.vlan_control |=
+ MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER;
+ }
+ } else
+ context->pri_path.counter_index =
+ MLX4_SINK_COUNTER_INDEX(dev->dev);
+
+ if (qp->flags & MLX4_IB_QP_NETIF) {
+ mlx4_ib_steer_qp_reg(dev, qp, 1);
+ steer_qp = 1;
+ }
+
+ if (qp_type == IB_QPT_GSI) {
+ enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
+ IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE;
+ u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
+
+ context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+ }
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+ context->pri_path.disable_pkey_check = 0x40;
+ context->pri_path.pkey_index = attr->pkey_index;
+ optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+ }
+
+ if (attr_mask & IB_QP_AV) {
+ u8 port_num = mlx4_is_bonded(dev->dev) ? 1 :
+ attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ u16 vlan = 0xffff;
+ u8 smac[ETH_ALEN];
+ int is_eth =
+ rdma_cap_eth_ah(&dev->ib_dev, port_num) &&
+ rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH;
+
+ if (is_eth) {
+ gid_attr = attr->ah_attr.grh.sgid_attr;
+ err = rdma_read_gid_l2_fields(gid_attr, &vlan,
+ &smac[0]);
+ if (err)
+ goto out;
+ }
+
+ if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
+ port_num, vlan, smac))
+ goto out;
+
+ optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+ MLX4_QP_OPTPAR_SCHED_QUEUE);
+
+ if (is_eth &&
+ (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) {
+ u8 qpc_roce_mode = gid_type_to_qpc(gid_attr->gid_type);
+
+ if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) {
+ err = -EINVAL;
+ goto out;
+ }
+ context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+ }
+
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ context->pri_path.ackto |= attr->timeout << 3;
+ optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_port_num == 0 ||
+ attr->alt_port_num > dev->dev->caps.num_ports)
+ goto out;
+
+ if (attr->alt_pkey_index >=
+ dev->dev->caps.pkey_table_len[attr->alt_port_num])
+ goto out;
+
+ if (mlx4_set_alt_path(dev, attr, attr_mask, qp,
+ &context->alt_path,
+ attr->alt_port_num))
+ goto out;
+
+ context->alt_path.pkey_index = attr->alt_pkey_index;
+ context->alt_path.ackto = attr->alt_timeout << 3;
+ optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+ }
+
+ context->pd = cpu_to_be32(pd->pdn);
+
+ if (!rwq_ind_tbl) {
+ context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+ get_cqs(qp, src_type, &send_cq, &recv_cq);
+ } else { /* Set dummy CQs to be compatible with HV and PRM */
+ send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq);
+ recv_cq = send_cq;
+ }
+ context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
+ context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
+
+ /* Set "fast registration enabled" for all kernel QPs */
+ if (!ucontext)
+ context->params1 |= cpu_to_be32(1 << 11);
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+ optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+ }
+
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+ optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic)
+ context->params1 |=
+ cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attr->max_dest_rd_atomic)
+ context->params2 |=
+ cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+ }
+
+ if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+ context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+ optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+ }
+
+ if (ibsrq)
+ context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+ optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+ }
+ if (attr_mask & IB_QP_RQ_PSN)
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */
+ if (attr_mask & IB_QP_QKEY) {
+ if (qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
+ context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+ else {
+ if (mlx4_is_mfunc(dev->dev) &&
+ !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) &&
+ (attr->qkey & MLX4_RESERVED_QKEY_MASK) ==
+ MLX4_RESERVED_QKEY_BASE) {
+ pr_err("Cannot use reserved QKEY"
+ " 0x%x (range 0xffff0000..0xffffffff"
+ " is reserved)\n", attr->qkey);
+ err = -EINVAL;
+ goto out;
+ }
+ context->qkey = cpu_to_be32(attr->qkey);
+ }
+ optpar |= MLX4_QP_OPTPAR_Q_KEY;
+ }
+
+ if (ibsrq)
+ context->srqn = cpu_to_be32(1 << 24 |
+ to_msrq(ibsrq)->msrq.srqn);
+
+ if (qp->rq.wqe_cnt &&
+ cur_state == IB_QPS_RESET &&
+ new_state == IB_QPS_INIT)
+ context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+ if (cur_state == IB_QPS_INIT &&
+ new_state == IB_QPS_RTR &&
+ (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI ||
+ qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) {
+ context->pri_path.sched_queue = (qp->port - 1) << 6;
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+ qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) {
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+ if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
+ context->pri_path.fl = 0x80;
+ } else {
+ if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+ context->pri_path.fl = 0x80;
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+ }
+ if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET) {
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
+ context->pri_path.feup = 1 << 7; /* don't fsm */
+ /* handle smac_index */
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
+ err = handle_eth_ud_smac_index(dev, qp, context);
+ if (err) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+ dev->qp1_proxy[qp->port - 1] = qp;
+ }
+ }
+ }
+
+ if (qp_type == IB_QPT_RAW_PACKET) {
+ context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
+ MLX4_IB_LINK_TYPE_ETH;
+ if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+ /* set QP to receive both tunneled & non-tunneled packets */
+ if (!rwq_ind_tbl)
+ context->srqn = cpu_to_be32(7 << 28);
+ }
+ }
+
+ if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
+ int is_eth = rdma_port_get_link_layer(
+ &dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET;
+ if (is_eth) {
+ context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH;
+ optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH;
+ }
+ }
+
+ if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
+ attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+ sqd_event = 1;
+ else
+ sqd_event = 0;
+
+ if (!ucontext &&
+ cur_state == IB_QPS_RESET &&
+ new_state == IB_QPS_INIT)
+ context->rlkey_roce_mode |= (1 << 4);
+
+ /*
+ * Before passing a kernel QP to the HW, make sure that the
+ * ownership bits of the send queue are set and the SQ
+ * headroom is stamped so that the hardware doesn't start
+ * processing stale work requests.
+ */
+ if (!ucontext &&
+ cur_state == IB_QPS_RESET &&
+ new_state == IB_QPS_INIT) {
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ int i;
+
+ for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+ ctrl = get_send_wqe(qp, i);
+ ctrl->owner_opcode = cpu_to_be32(1 << 31);
+ ctrl->qpn_vlan.fence_size =
+ 1 << (qp->sq.wqe_shift - 4);
+ stamp_send_wqe(qp, i);
+ }
+ }
+
+ if (rwq_ind_tbl &&
+ cur_state == IB_QPS_RESET &&
+ new_state == IB_QPS_INIT) {
+ fill_qp_rss_context(context, qp);
+ context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
+ }
+
+ err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+ to_mlx4_state(new_state), context, optpar,
+ sqd_event, &qp->mqp);
+ if (err)
+ goto out;
+
+ qp->state = new_state;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->atomic_rd_en = attr->qp_access_flags;
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->resp_depth = attr->max_dest_rd_atomic;
+ if (attr_mask & IB_QP_PORT) {
+ qp->port = attr->port_num;
+ update_mcg_macs(dev, qp);
+ }
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_port = attr->alt_port_num;
+
+ if (is_sqp(dev, qp))
+ store_sqp_attrs(qp->sqp, attr, attr_mask);
+
+ /*
+ * If we moved QP0 to RTR, bring the IB link up; if we moved
+ * QP0 to RESET or ERROR, bring the link back down.
+ */
+ if (is_qp0(dev, qp)) {
+ if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+ if (mlx4_INIT_PORT(dev->dev, qp->port))
+ pr_warn("INIT_PORT failed for port %d\n",
+ qp->port);
+
+ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+ mlx4_CLOSE_PORT(dev->dev, qp->port);
+ }
+
+ /*
+ * If we moved a kernel QP to RESET, clean up all old CQ
+ * entries and reinitialize the QP.
+ */
+ if (new_state == IB_QPS_RESET) {
+ if (!ucontext) {
+ mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ ibsrq ? to_msrq(ibsrq) : NULL);
+ if (send_cq != recv_cq)
+ mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->sq_next_wqe = 0;
+ if (qp->rq.wqe_cnt)
+ *qp->db.db = 0;
+
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ mlx4_ib_steer_qp_reg(dev, qp, 0);
+ }
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
+ mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+ qp->pri.smac = 0;
+ qp->pri.smac_port = 0;
+ }
+ if (qp->alt.smac) {
+ mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+ qp->alt.smac = 0;
+ }
+ if (qp->pri.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+ qp->pri.vid = 0xFFFF;
+ qp->pri.candidate_vid = 0xFFFF;
+ qp->pri.update_vid = 0;
+ }
+
+ if (qp->alt.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+ qp->alt.vid = 0xFFFF;
+ qp->alt.candidate_vid = 0xFFFF;
+ qp->alt.update_vid = 0;
+ }
+ }
+out:
+ if (err && qp->counter_index)
+ mlx4_ib_free_qp_counter(dev, qp);
+ if (err && steer_qp)
+ mlx4_ib_steer_qp_reg(dev, qp, 0);
+ kfree(context);
+ if (qp->pri.candidate_smac ||
+ (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) {
+ if (err) {
+ mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
+ } else {
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port))
+ mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+ qp->pri.smac = qp->pri.candidate_smac;
+ qp->pri.smac_index = qp->pri.candidate_smac_index;
+ qp->pri.smac_port = qp->pri.candidate_smac_port;
+ }
+ qp->pri.candidate_smac = 0;
+ qp->pri.candidate_smac_index = 0;
+ qp->pri.candidate_smac_port = 0;
+ }
+ if (qp->alt.candidate_smac) {
+ if (err) {
+ mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac);
+ } else {
+ if (qp->alt.smac)
+ mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+ qp->alt.smac = qp->alt.candidate_smac;
+ qp->alt.smac_index = qp->alt.candidate_smac_index;
+ qp->alt.smac_port = qp->alt.candidate_smac_port;
+ }
+ qp->alt.candidate_smac = 0;
+ qp->alt.candidate_smac_index = 0;
+ qp->alt.candidate_smac_port = 0;
+ }
+
+ if (qp->pri.update_vid) {
+ if (err) {
+ if (qp->pri.candidate_vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port,
+ qp->pri.candidate_vid);
+ } else {
+ if (qp->pri.vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port,
+ qp->pri.vid);
+ qp->pri.vid = qp->pri.candidate_vid;
+ qp->pri.vlan_port = qp->pri.candidate_vlan_port;
+ qp->pri.vlan_index = qp->pri.candidate_vlan_index;
+ }
+ qp->pri.candidate_vid = 0xFFFF;
+ qp->pri.update_vid = 0;
+ }
+
+ if (qp->alt.update_vid) {
+ if (err) {
+ if (qp->alt.candidate_vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port,
+ qp->alt.candidate_vid);
+ } else {
+ if (qp->alt.vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port,
+ qp->alt.vid);
+ qp->alt.vid = qp->alt.candidate_vid;
+ qp->alt.vlan_port = qp->alt.candidate_vlan_port;
+ qp->alt.vlan_index = qp->alt.candidate_vlan_index;
+ }
+ qp->alt.candidate_vid = 0xFFFF;
+ qp->alt.update_vid = 0;
+ }
+
+ return err;
+}
+
+enum {
+ MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE |
+ IB_QP_PORT),
+};
+
+static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+ mutex_lock(&qp->mutex);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask)) {
+ pr_debug("qpn 0x%x: invalid attribute mask specified "
+ "for transition %d to %d. qp_type %d,"
+ " attr_mask 0x%x\n",
+ ibqp->qp_num, cur_state, new_state,
+ ibqp->qp_type, attr_mask);
+ goto out;
+ }
+
+ if (ibqp->rwq_ind_tbl) {
+ if (!(((cur_state == IB_QPS_RESET) &&
+ (new_state == IB_QPS_INIT)) ||
+ ((cur_state == IB_QPS_INIT) &&
+ (new_state == IB_QPS_RTR)))) {
+ pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n",
+ ibqp->qp_num, cur_state, new_state);
+
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) {
+ pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n",
+ ibqp->qp_num, attr_mask, cur_state, new_state);
+
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) {
+ if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
+ if ((ibqp->qp_type == IB_QPT_RC) ||
+ (ibqp->qp_type == IB_QPT_UD) ||
+ (ibqp->qp_type == IB_QPT_UC) ||
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ||
+ (ibqp->qp_type == IB_QPT_XRC_INI)) {
+ attr->port_num = mlx4_ib_bond_next_port(dev);
+ }
+ } else {
+ /* no sense in changing port_num
+ * when ports are bonded */
+ attr_mask &= ~IB_QP_PORT;
+ }
+ }
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
+ pr_debug("qpn 0x%x: invalid port number (%d) specified "
+ "for transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->port_num, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
+ (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
+ IB_LINK_LAYER_ETHERNET))
+ goto out;
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
+ pr_debug("qpn 0x%x: invalid pkey index (%d) specified "
+ "for transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->pkey_index, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+ pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. "
+ "Transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->max_rd_atomic, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+ pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
+ "Transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) {
+ err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num,
+ udata);
+ if (err)
+ goto out;
+ }
+
+ err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask,
+ cur_state, new_state, udata);
+
+ if (ibqp->rwq_ind_tbl && err)
+ bring_down_rss_rwqs(ibqp->rwq_ind_tbl, udata);
+
+ if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
+ attr->port_num = 1;
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ int ret;
+
+ if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
+ return -EOPNOTSUPP;
+
+ ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
+
+ if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = mqp->sqp;
+ int err = 0;
+
+ if (sqp->roce_v2_gsi)
+ err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask);
+ if (err)
+ pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n",
+ err);
+ }
+ return ret;
+}
+
+static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
+{
+ int i;
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (qpn == dev->caps.spec_qps[i].qp0_proxy ||
+ qpn == dev->caps.spec_qps[i].qp0_tunnel) {
+ *qkey = dev->caps.spec_qps[i].qp0_qkey;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static int build_sriov_qp0_header(struct mlx4_ib_qp *qp,
+ const struct ib_ud_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(qp->ibqp.device);
+ struct mlx4_ib_sqp *sqp = qp->sqp;
+ struct ib_device *ib_dev = qp->ibqp.device;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->ah);
+ u16 pkey;
+ u32 qkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int err;
+ int i;
+
+ if (wr->wr.opcode != IB_WR_SEND)
+ return -EINVAL;
+
+ send_size = 0;
+
+ for (i = 0; i < wr->wr.num_sge; ++i)
+ send_size += wr->wr.sg_list[i].length;
+
+ /* for proxy-qp0 sends, need to add in size of tunnel header */
+ /* for tunnel-qp0 sends, tunnel header is already in s/g list */
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
+ send_size += sizeof (struct mlx4_ib_tunnel_header);
+
+ ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
+
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid =
+ cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ sqp->ud_header.lrh.source_lid =
+ cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ /* force loopback */
+ mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+
+ sqp->ud_header.lrh.virtual_lane = 0;
+ sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
+ err = ib_get_cached_pkey(ib_dev, qp->port, 0, &pkey);
+ if (err)
+ return err;
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
+ else
+ sqp->ud_header.bth.destination_qpn =
+ cpu_to_be32(mdev->dev->caps.spec_qps[qp->port - 1].qp0_tunnel);
+
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ if (mlx4_is_master(mdev->dev)) {
+ if (mlx4_get_parav_qkey(mdev->dev, qp->mqp.qpn, &qkey))
+ return -EINVAL;
+ } else {
+ if (vf_get_qp0_qkey(mdev->dev, qp->mqp.qpn, &qkey))
+ return -EINVAL;
+ }
+ sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(qp->mqp.qpn);
+
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+ return 0;
+}
+
+static u8 sl_to_vl(struct mlx4_ib_dev *dev, u8 sl, int port_num)
+{
+ union sl2vl_tbl_to_u64 tmp_vltab;
+ u8 vl;
+
+ if (sl > 15)
+ return 0xf;
+ tmp_vltab.sl64 = atomic64_read(&dev->sl2vl[port_num - 1]);
+ vl = tmp_vltab.sl8[sl >> 1];
+ if (sl & 1)
+ vl &= 0x0f;
+ else
+ vl >>= 4;
+ return vl;
+}
+
+static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num,
+ int index, union ib_gid *gid,
+ enum ib_gid_type *gid_type)
+{
+ struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+ struct mlx4_port_gid_table *port_gid_table;
+ unsigned long flags;
+
+ port_gid_table = &iboe->gids[port_num - 1];
+ spin_lock_irqsave(&iboe->lock, flags);
+ memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid));
+ *gid_type = port_gid_table->gids[index].gid_type;
+ spin_unlock_irqrestore(&iboe->lock, flags);
+ if (rdma_is_zero_gid(gid))
+ return -ENOENT;
+
+ return 0;
+}
+
+#define MLX4_ROCEV2_QP1_SPORT 0xC000
+static int build_mlx_header(struct mlx4_ib_qp *qp, const struct ib_ud_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ struct mlx4_ib_sqp *sqp = qp->sqp;
+ struct ib_device *ib_dev = qp->ibqp.device;
+ struct mlx4_ib_dev *ibdev = to_mdev(ib_dev);
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->ah);
+ union ib_gid sgid;
+ u16 pkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int i;
+ int err = 0;
+ u16 vlan = 0xffff;
+ bool is_eth;
+ bool is_vlan = false;
+ bool is_grh;
+ bool is_udp = false;
+ int ip_version = 0;
+
+ send_size = 0;
+ for (i = 0; i < wr->wr.num_sge; ++i)
+ send_size += wr->wr.sg_list[i].length;
+
+ is_eth = rdma_port_get_link_layer(qp->ibqp.device, qp->port) == IB_LINK_LAYER_ETHERNET;
+ is_grh = mlx4_ib_ah_grh_present(ah);
+ if (is_eth) {
+ enum ib_gid_type gid_type;
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache */
+ err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sgid.raw[0]);
+ if (err)
+ return err;
+ } else {
+ err = fill_gid_by_hw_index(ibdev, qp->port,
+ ah->av.ib.gid_index, &sgid,
+ &gid_type);
+ if (!err) {
+ is_udp = gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
+ if (is_udp) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
+ ip_version = 4;
+ else
+ ip_version = 6;
+ is_grh = false;
+ }
+ } else {
+ return err;
+ }
+ }
+ if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
+ vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
+ is_vlan = true;
+ }
+ }
+ err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
+ ip_version, is_udp, 0, &sqp->ud_header);
+ if (err)
+ return err;
+
+ if (!is_eth) {
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
+ sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ }
+
+ if (is_grh || (ip_version == 6)) {
+ sqp->ud_header.grh.traffic_class =
+ (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+ sqp->ud_header.grh.flow_label =
+ ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
+ if (is_eth) {
+ memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
+ } else {
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache
+ */
+ sqp->ud_header.grh.source_gid.global
+ .subnet_prefix =
+ cpu_to_be64(atomic64_read(
+ &(to_mdev(ib_dev)
+ ->sriov
+ .demux[qp->port - 1]
+ .subnet_prefix)));
+ sqp->ud_header.grh.source_gid.global
+ .interface_id =
+ to_mdev(ib_dev)
+ ->sriov.demux[qp->port - 1]
+ .guid_cache[ah->av.ib.gid_index];
+ } else {
+ sqp->ud_header.grh.source_gid =
+ ah->ibah.sgid_attr->gid;
+ }
+ }
+ memcpy(sqp->ud_header.grh.destination_gid.raw,
+ ah->av.ib.dgid, 16);
+ }
+
+ if (ip_version == 4) {
+ sqp->ud_header.ip4.tos =
+ (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+ sqp->ud_header.ip4.id = 0;
+ sqp->ud_header.ip4.frag_off = htons(IP_DF);
+ sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit;
+
+ memcpy(&sqp->ud_header.ip4.saddr,
+ sgid.raw + 12, 4);
+ memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4);
+ sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header);
+ }
+
+ if (is_udp) {
+ sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT);
+ sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT);
+ sqp->ud_header.udp.csum = 0;
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ if (!is_eth) {
+ mlx->flags |=
+ cpu_to_be32((!qp->ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid ==
+ IB_LID_PERMISSIVE ?
+ MLX4_WQE_MLX_SLR :
+ 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
+ mlx->flags |= cpu_to_be32(0x1); /* force loopback */
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+ }
+
+ switch (wr->wr.opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->wr.ex.imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (is_eth) {
+ u16 ether_type;
+ u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
+
+ ether_type = (!is_udp) ? ETH_P_IBOE:
+ (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6);
+
+ mlx->sched_prio = cpu_to_be16(pcp);
+
+ ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac);
+ ether_addr_copy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac);
+ memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
+ memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
+
+ if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
+ mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+ if (!is_vlan) {
+ sqp->ud_header.eth.type = cpu_to_be16(ether_type);
+ } else {
+ sqp->ud_header.vlan.type = cpu_to_be16(ether_type);
+ sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
+ }
+ } else {
+ sqp->ud_header.lrh.virtual_lane =
+ !qp->ibqp.qp_num ?
+ 15 :
+ sl_to_vl(to_mdev(ib_dev),
+ sqp->ud_header.lrh.service_level,
+ qp->port);
+ if (qp->ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15)
+ return -EINVAL;
+ if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+ sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+ }
+ sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
+ if (!qp->ibqp.qp_num)
+ err = ib_get_cached_pkey(ib_dev, qp->port, sqp->pkey_index,
+ &pkey);
+ else
+ err = ib_get_cached_pkey(ib_dev, qp->port, wr->pkey_index,
+ &pkey);
+ if (err)
+ return err;
+
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(qp->ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ if (0) {
+ pr_err("built UD header of size %d:\n", header_size);
+ for (i = 0; i < header_size / 4; ++i) {
+ if (i % 8 == 0)
+ pr_err(" [%02x] ", i * 4);
+ pr_cont(" %08x",
+ be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+ if ((i + 1) % 8 == 0)
+ pr_cont("\n");
+ }
+ pr_err("\n");
+ }
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+ return 0;
+}
+
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mlx4_ib_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+static __be32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) |
+ cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
+}
+
+static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg,
+ const struct ib_reg_wr *wr)
+{
+ struct mlx4_ib_mr *mr = to_mmr(wr->mr);
+
+ fseg->flags = convert_access(wr->access);
+ fseg->mem_key = cpu_to_be32(wr->key);
+ fseg->buf_list = cpu_to_be64(mr->page_map);
+ fseg->start_addr = cpu_to_be64(mr->ibmr.iova);
+ fseg->reg_len = cpu_to_be64(mr->ibmr.length);
+ fseg->offset = 0; /* XXX -- is this just for ZBVA? */
+ fseg->page_size = cpu_to_be32(ilog2(mr->ibmr.page_size));
+ fseg->reserved[0] = 0;
+ fseg->reserved[1] = 0;
+}
+
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+ memset(iseg, 0, sizeof(*iseg));
+ iseg->mem_key = cpu_to_be32(rkey);
+}
+
+static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ u64 remote_addr, u32 rkey)
+{
+ rseg->raddr = cpu_to_be64(remote_addr);
+ rseg->rkey = cpu_to_be32(rkey);
+ rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg,
+ const struct ib_atomic_wr *wr)
+{
+ if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = cpu_to_be64(wr->swap);
+ aseg->compare = cpu_to_be64(wr->compare_add);
+ } else if (wr->wr.opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+ aseg->swap_add = cpu_to_be64(wr->compare_add);
+ aseg->compare = cpu_to_be64(wr->compare_add_mask);
+ } else {
+ aseg->swap_add = cpu_to_be64(wr->compare_add);
+ aseg->compare = 0;
+ }
+
+}
+
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+ const struct ib_atomic_wr *wr)
+{
+ aseg->swap_add = cpu_to_be64(wr->swap);
+ aseg->swap_add_mask = cpu_to_be64(wr->swap_mask);
+ aseg->compare = cpu_to_be64(wr->compare_add);
+ aseg->compare_mask = cpu_to_be64(wr->compare_add_mask);
+}
+
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+ const struct ib_ud_wr *wr)
+{
+ memcpy(dseg->av, &to_mah(wr->ah)->av, sizeof (struct mlx4_av));
+ dseg->dqpn = cpu_to_be32(wr->remote_qpn);
+ dseg->qkey = cpu_to_be32(wr->remote_qkey);
+ dseg->vlan = to_mah(wr->ah)->av.eth.vlan;
+ memcpy(dseg->mac, to_mah(wr->ah)->av.eth.mac, 6);
+}
+
+static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
+ struct mlx4_wqe_datagram_seg *dseg,
+ const struct ib_ud_wr *wr,
+ enum mlx4_ib_qp_type qpt)
+{
+ union mlx4_ext_av *av = &to_mah(wr->ah)->av;
+ struct mlx4_av sqp_av = {0};
+ int port = *((u8 *) &av->ib.port_pd) & 0x3;
+
+ /* force loopback */
+ sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
+ sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
+ sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
+ cpu_to_be32(0xf0000000);
+
+ memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
+ if (qpt == MLX4_IB_QPT_PROXY_GSI)
+ dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp1_tunnel);
+ else
+ dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp0_tunnel);
+ /* Use QKEY from the QP context, which is set by master */
+ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+}
+
+static void build_tunnel_header(const struct ib_ud_wr *wr, void *wqe,
+ unsigned *mlx_seg_len)
+{
+ struct mlx4_wqe_inline_seg *inl = wqe;
+ struct mlx4_ib_tunnel_header hdr;
+ struct mlx4_ib_ah *ah = to_mah(wr->ah);
+ int spc;
+ int i;
+
+ memcpy(&hdr.av, &ah->av, sizeof hdr.av);
+ hdr.remote_qpn = cpu_to_be32(wr->remote_qpn);
+ hdr.pkey_index = cpu_to_be16(wr->pkey_index);
+ hdr.qkey = cpu_to_be32(wr->remote_qkey);
+ memcpy(hdr.mac, ah->av.eth.mac, 6);
+ hdr.vlan = ah->av.eth.vlan;
+
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (sizeof (hdr) <= spc) {
+ memcpy(inl + 1, &hdr, sizeof (hdr));
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
+ i = 1;
+ } else {
+ memcpy(inl + 1, &hdr, spc);
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
+}
+
+static void set_mlx_icrc_seg(void *dseg)
+{
+ u32 *t = dseg;
+ struct mlx4_wqe_inline_seg *iseg = dseg;
+
+ t[1] = 0;
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= * 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= * 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ dseg->byte_count = cpu_to_be32(sg->length);
+}
+
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+}
+
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe,
+ const struct ib_ud_wr *wr, struct mlx4_ib_qp *qp,
+ unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh)
+{
+ unsigned halign = ALIGN(sizeof *wqe + wr->hlen, 16);
+
+ if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
+ *blh = cpu_to_be32(1 << 6);
+
+ if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
+ wr->wr.num_sge > qp->sq.max_gs - (halign >> 4)))
+ return -EINVAL;
+
+ memcpy(wqe->header, wr->header, wr->hlen);
+
+ *lso_hdr_sz = cpu_to_be32(wr->mss << 16 | wr->hlen);
+ *lso_seg_len = halign;
+ return 0;
+}
+
+static __be32 send_ieth(const struct ib_send_wr *wr)
+{
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ return wr->ex.imm_data;
+
+ case IB_WR_SEND_WITH_INV:
+ return cpu_to_be32(wr->ex.invalidate_rkey);
+
+ default:
+ return 0;
+ }
+}
+
+static void add_zero_len_inline(void *wqe)
+{
+ struct mlx4_wqe_inline_seg *inl = wqe;
+ memset(wqe, 0, 16);
+ inl->byte_count = cpu_to_be32(1 << 31);
+}
+
+static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr, bool drain)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_data_seg *dseg;
+ unsigned long flags;
+ int nreq;
+ int err = 0;
+ unsigned ind;
+ int size;
+ unsigned seglen;
+ __be32 dummy;
+ __be32 *lso_wqe;
+ __be32 lso_hdr_sz;
+ __be32 blh;
+ int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = qp->sqp;
+
+ if (sqp->roce_v2_gsi) {
+ struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah);
+ enum ib_gid_type gid_type;
+ union ib_gid gid;
+
+ if (!fill_gid_by_hw_index(mdev, qp->port,
+ ah->av.ib.gid_index,
+ &gid, &gid_type))
+ qp = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
+ to_mqp(sqp->roce_v2_gsi) : qp;
+ else
+ pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n",
+ ah->av.ib.gid_index);
+ }
+ }
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR &&
+ !drain) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
+ ind = qp->sq_next_wqe;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ lso_wqe = &dummy;
+ blh = 0;
+
+ if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IB_SEND_SIGNALED ?
+ cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IB_SEND_SOLICITED ?
+ cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+ ((wr->send_flags & IB_SEND_IP_CSUM) ?
+ cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
+ qp->sq_signal_bits;
+
+ ctrl->imm = send_ieth(wr);
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ switch (qp->mlx4_ib_qp_type) {
+ case MLX4_IB_QPT_RC:
+ case MLX4_IB_QPT_UC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, atomic_wr(wr)->remote_addr,
+ atomic_wr(wr)->rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, atomic_wr(wr));
+ wqe += sizeof (struct mlx4_wqe_atomic_seg);
+
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+ set_raddr_seg(wqe, atomic_wr(wr)->remote_addr,
+ atomic_wr(wr)->rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_masked_atomic_seg(wqe, atomic_wr(wr));
+ wqe += sizeof (struct mlx4_wqe_masked_atomic_seg);
+
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
+
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, rdma_wr(wr)->remote_addr,
+ rdma_wr(wr)->rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+ size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+ break;
+
+ case IB_WR_LOCAL_INV:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+ wqe += sizeof (struct mlx4_wqe_local_inval_seg);
+ size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
+ break;
+
+ case IB_WR_REG_MR:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_reg_seg(wqe, reg_wr(wr));
+ wqe += sizeof(struct mlx4_wqe_fmr_seg);
+ size += sizeof(struct mlx4_wqe_fmr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+ break;
+
+ case MLX4_IB_QPT_TUN_SMI_OWNER:
+ err = build_sriov_qp0_header(qp, ud_wr(wr), ctrl,
+ &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+ case MLX4_IB_QPT_TUN_SMI:
+ case MLX4_IB_QPT_TUN_GSI:
+ /* this is a UD qp used in MAD responses to slaves. */
+ set_datagram_seg(wqe, ud_wr(wr));
+ /* set the forced-loopback bit in the data seg av */
+ *(__be32 *) wqe |= cpu_to_be32(0x80000000);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ break;
+ case MLX4_IB_QPT_UD:
+ set_datagram_seg(wqe, ud_wr(wr));
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+ if (wr->opcode == IB_WR_LSO) {
+ err = build_lso_seg(wqe, ud_wr(wr), qp, &seglen,
+ &lso_hdr_sz, &blh);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ lso_wqe = (__be32 *) wqe;
+ wqe += seglen;
+ size += seglen / 16;
+ }
+ break;
+
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ err = build_sriov_qp0_header(qp, ud_wr(wr), ctrl,
+ &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ /* to start tunnel header on a cache-line boundary */
+ add_zero_len_inline(wqe);
+ wqe += 16;
+ size++;
+ build_tunnel_header(ud_wr(wr), wqe, &seglen);
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ /* If we are tunneling special qps, this is a UD qp.
+ * In this case we first add a UD segment targeting
+ * the tunnel qp, and then add a header with address
+ * information */
+ set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe,
+ ud_wr(wr),
+ qp->mlx4_ib_qp_type);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ build_tunnel_header(ud_wr(wr), wqe, &seglen);
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
+ err = build_mlx_header(qp, ud_wr(wr), ctrl, &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * Write data segments in reverse order, so as to
+ * overwrite cacheline stamp last within each
+ * cacheline. This avoids issues with WQE
+ * prefetching.
+ */
+
+ dseg = wqe;
+ dseg += wr->num_sge - 1;
+ size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
+
+ /* Add one more inline data segment for ICRC for MLX sends */
+ if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
+ qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+ set_mlx_icrc_seg(dseg + 1);
+ size += sizeof (struct mlx4_wqe_data_seg) / 16;
+ }
+
+ for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+ set_data_seg(dseg, wr->sg_list + i);
+
+ /*
+ * Possibly overwrite stamping in cacheline with LSO
+ * segment only after making sure all data segments
+ * are written.
+ */
+ wmb();
+ *lso_wqe = lso_hdr_sz;
+
+ ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+
+ if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+ *bad_wr = wr;
+ err = -EINVAL;
+ goto out;
+ }
+
+ ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+ (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
+
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ */
+ if (wr->next)
+ stamp_send_wqe(qp, ind + qp->sq_spare_wqes);
+ ind++;
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ writel_relaxed(qp->doorbell_qpn,
+ to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+
+ stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1);
+
+ qp->sq_next_wqe = ind;
+ }
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+ return err;
+}
+
+int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr)
+{
+ return _mlx4_ib_post_send(ibqp, wr, bad_wr, false);
+}
+
+static int _mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr, bool drain)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int max_gs;
+ int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+
+ max_gs = qp->rq.max_gs;
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR &&
+ !drain) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ ib_dma_sync_single_for_device(ibqp->device,
+ qp->sqp_proxy_rcv[ind].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ scat->byte_count =
+ cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr));
+ /* use dma lkey from upper layer entry */
+ scat->lkey = cpu_to_be32(wr->sg_list->lkey);
+ scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
+ scat++;
+ max_gs--;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i)
+ __set_data_seg(scat + i, wr->sg_list + i);
+
+ if (i < max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+ return err;
+}
+
+int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ return _mlx4_ib_post_recv(ibqp, wr, bad_wr, false);
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+ switch (mlx4_state) {
+ case MLX4_QP_STATE_RST: return IB_QPS_RESET;
+ case MLX4_QP_STATE_INIT: return IB_QPS_INIT;
+ case MLX4_QP_STATE_RTR: return IB_QPS_RTR;
+ case MLX4_QP_STATE_RTS: return IB_QPS_RTS;
+ case MLX4_QP_STATE_SQ_DRAINING:
+ case MLX4_QP_STATE_SQD: return IB_QPS_SQD;
+ case MLX4_QP_STATE_SQER: return IB_QPS_SQE;
+ case MLX4_QP_STATE_ERR: return IB_QPS_ERR;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+ switch (mlx4_mig_state) {
+ case MLX4_QP_PM_ARMED: return IB_MIG_ARMED;
+ case MLX4_QP_PM_REARM: return IB_MIG_REARM;
+ case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED;
+ default: return -1;
+ }
+}
+
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+ int ib_flags = 0;
+
+ if (mlx4_flags & MLX4_QP_BIT_RRE)
+ ib_flags |= IB_ACCESS_REMOTE_READ;
+ if (mlx4_flags & MLX4_QP_BIT_RWE)
+ ib_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (mlx4_flags & MLX4_QP_BIT_RAE)
+ ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+ return ib_flags;
+}
+
+static void to_rdma_ah_attr(struct mlx4_ib_dev *ibdev,
+ struct rdma_ah_attr *ah_attr,
+ struct mlx4_qp_path *path)
+{
+ struct mlx4_dev *dev = ibdev->dev;
+ u8 port_num = path->sched_queue & 0x40 ? 2 : 1;
+
+ memset(ah_attr, 0, sizeof(*ah_attr));
+ if (port_num == 0 || port_num > dev->caps.num_ports)
+ return;
+ ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port_num);
+
+ if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE)
+ rdma_ah_set_sl(ah_attr, ((path->sched_queue >> 3) & 0x7) |
+ ((path->sched_queue & 4) << 1));
+ else
+ rdma_ah_set_sl(ah_attr, (path->sched_queue >> 2) & 0xf);
+ rdma_ah_set_port_num(ah_attr, port_num);
+
+ rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid));
+ rdma_ah_set_path_bits(ah_attr, path->grh_mylmc & 0x7f);
+ rdma_ah_set_static_rate(ah_attr,
+ path->static_rate ? path->static_rate - 5 : 0);
+ if (path->grh_mylmc & (1 << 7)) {
+ rdma_ah_set_grh(ah_attr, NULL,
+ be32_to_cpu(path->tclass_flowlabel) & 0xfffff,
+ path->mgid_index,
+ path->hop_limit,
+ (be32_to_cpu(path->tclass_flowlabel)
+ >> 20) & 0xff);
+ rdma_ah_set_dgid_raw(ah_attr, path->rgid);
+ }
+}
+
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_qp_context context;
+ int mlx4_state;
+ int err = 0;
+
+ if (ibqp->rwq_ind_tbl)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&qp->mutex);
+
+ if (qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+ if (err) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+ qp->state = to_ib_qp_state(mlx4_state);
+ qp_attr->qp_state = qp->state;
+ qp_attr->path_mtu = context.mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context.qkey);
+ qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+ qp->ibqp.qp_type == IB_QPT_XRC_TGT) {
+ to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+ to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
+ qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+ qp_attr->alt_port_num =
+ rdma_ah_get_port_num(&qp_attr->alt_ah_attr);
+ }
+
+ qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+ if (qp_attr->qp_state == IB_QPS_INIT)
+ qp_attr->port_num = qp->port;
+ else
+ qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+ qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context.pri_path.ackto >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
+ qp_attr->alt_timeout = context.alt_path.ackto >> 3;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt;
+ qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+
+ if (!ibqp->uobject) {
+ qp_attr->cap.max_send_wr = qp->sq.wqe_cnt;
+ qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ } else {
+ qp_attr->cap.max_send_wr = 0;
+ qp_attr->cap.max_send_sge = 0;
+ }
+
+ /*
+ * We don't support inline sends for kernel QPs (yet), and we
+ * don't know what userspace's value should be.
+ */
+ qp_attr->cap.max_inline_data = 0;
+
+ qp_init_attr->cap = qp_attr->cap;
+
+ qp_init_attr->create_flags = 0;
+ if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ if (qp->flags & MLX4_IB_QP_LSO)
+ qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP;
+
+ qp_init_attr->sq_sig_type =
+ qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+ struct ib_qp_init_attr ib_qp_init_attr = {};
+ struct mlx4_ib_qp *qp;
+ struct mlx4_ib_create_wq ucmd;
+ int err, required_cmd_sz;
+
+ if (!udata)
+ return ERR_PTR(-EINVAL);
+
+ required_cmd_sz = offsetof(typeof(ucmd), comp_mask) +
+ sizeof(ucmd.comp_mask);
+ if (udata->inlen < required_cmd_sz) {
+ pr_debug("invalid inlen\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (udata->inlen > sizeof(ucmd) &&
+ !ib_is_udata_cleared(udata, sizeof(ucmd),
+ udata->inlen - sizeof(ucmd))) {
+ pr_debug("inlen is not supported\n");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (udata->outlen)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (init_attr->wq_type != IB_WQT_RQ) {
+ pr_debug("unsupported wq type %d\n", init_attr->wq_type);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS ||
+ !(dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)) {
+ pr_debug("unsupported create_flags %u\n",
+ init_attr->create_flags);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&qp->mutex);
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+
+ ib_qp_init_attr.qp_context = init_attr->wq_context;
+ ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET;
+ ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr;
+ ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge;
+ ib_qp_init_attr.recv_cq = init_attr->cq;
+ ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */
+
+ if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS)
+ ib_qp_init_attr.create_flags |= IB_QP_CREATE_SCATTER_FCS;
+
+ err = create_rq(pd, &ib_qp_init_attr, udata, qp);
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ qp->ibwq.event_handler = init_attr->event_handler;
+ qp->ibwq.wq_num = qp->mqp.qpn;
+ qp->ibwq.state = IB_WQS_RESET;
+
+ return &qp->ibwq;
+}
+
+static int ib_wq2qp_state(enum ib_wq_state state)
+{
+ switch (state) {
+ case IB_WQS_RESET:
+ return IB_QPS_RESET;
+ case IB_WQS_RDY:
+ return IB_QPS_RTR;
+ default:
+ return IB_QPS_ERR;
+ }
+}
+
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
+ enum ib_qp_state qp_cur_state;
+ enum ib_qp_state qp_new_state;
+ int attr_mask;
+ int err;
+
+ /* ib_qp.state represents the WQ HW state while ib_wq.state represents
+ * the WQ logic state.
+ */
+ qp_cur_state = qp->state;
+ qp_new_state = ib_wq2qp_state(new_state);
+
+ if (ib_wq2qp_state(new_state) == qp_cur_state)
+ return 0;
+
+ if (new_state == IB_WQS_RDY) {
+ struct ib_qp_attr attr = {};
+
+ attr.port_num = qp->port;
+ attr_mask = IB_QP_PORT;
+
+ err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr,
+ attr_mask, IB_QPS_RESET, IB_QPS_INIT,
+ udata);
+ if (err) {
+ pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n",
+ ibwq->wq_num);
+ return err;
+ }
+
+ qp_cur_state = IB_QPS_INIT;
+ }
+
+ attr_mask = 0;
+ err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask,
+ qp_cur_state, qp_new_state, udata);
+
+ if (err && (qp_cur_state == IB_QPS_INIT)) {
+ qp_new_state = IB_QPS_RESET;
+ if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL,
+ attr_mask, IB_QPS_INIT, IB_QPS_RESET,
+ udata)) {
+ pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n",
+ ibwq->wq_num);
+ qp_new_state = IB_QPS_INIT;
+ }
+ }
+
+ qp->state = qp_new_state;
+
+ return err;
+}
+
+int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
+ u32 wq_attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
+ struct mlx4_ib_modify_wq ucmd = {};
+ size_t required_cmd_sz;
+ enum ib_wq_state cur_state, new_state;
+ int err = 0;
+
+ required_cmd_sz = offsetof(typeof(ucmd), reserved) +
+ sizeof(ucmd.reserved);
+ if (udata->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (udata->inlen > sizeof(ucmd) &&
+ !ib_is_udata_cleared(udata, sizeof(ucmd),
+ udata->inlen - sizeof(ucmd)))
+ return -EOPNOTSUPP;
+
+ if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
+ return -EFAULT;
+
+ if (ucmd.comp_mask || ucmd.reserved)
+ return -EOPNOTSUPP;
+
+ if (wq_attr_mask & IB_WQ_FLAGS)
+ return -EOPNOTSUPP;
+
+ cur_state = wq_attr->curr_wq_state;
+ new_state = wq_attr->wq_state;
+
+ if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR))
+ return -EINVAL;
+
+ if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET))
+ return -EINVAL;
+
+ /* Need to protect against the parent RSS which also may modify WQ
+ * state.
+ */
+ mutex_lock(&qp->mutex);
+
+ /* Can update HW state only if a RSS QP has already associated to this
+ * WQ, so we can apply its port on the WQ.
+ */
+ if (qp->rss_usecnt)
+ err = _mlx4_ib_modify_wq(ibwq, new_state, udata);
+
+ if (!err)
+ ibwq->state = new_state;
+
+ mutex_unlock(&qp->mutex);
+
+ return err;
+}
+
+int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
+ struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
+
+ if (qp->counter_index)
+ mlx4_ib_free_qp_counter(dev, qp);
+
+ destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata);
+
+ kfree(qp);
+ return 0;
+}
+
+int mlx4_ib_create_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table,
+ struct ib_rwq_ind_table_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_create_rwq_ind_tbl_resp resp = {};
+ unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size;
+ struct ib_device *device = rwq_ind_table->device;
+ unsigned int base_wqn;
+ size_t min_resp_len;
+ int i, err = 0;
+
+ if (udata->inlen > 0 &&
+ !ib_is_udata_cleared(udata, 0,
+ udata->inlen))
+ return -EOPNOTSUPP;
+
+ min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+ if (udata->outlen && udata->outlen < min_resp_len)
+ return -EINVAL;
+
+ if (ind_tbl_size >
+ device->attrs.rss_caps.max_rwq_indirection_table_size) {
+ pr_debug("log_ind_tbl_size = %d is bigger than supported = %d\n",
+ ind_tbl_size,
+ device->attrs.rss_caps.max_rwq_indirection_table_size);
+ return -EINVAL;
+ }
+
+ base_wqn = init_attr->ind_tbl[0]->wq_num;
+
+ if (base_wqn % ind_tbl_size) {
+ pr_debug("WQN=0x%x isn't aligned with indirection table size\n",
+ base_wqn);
+ return -EINVAL;
+ }
+
+ for (i = 1; i < ind_tbl_size; i++) {
+ if (++base_wqn != init_attr->ind_tbl[i]->wq_num) {
+ pr_debug("indirection table's WQNs aren't consecutive\n");
+ return -EINVAL;
+ }
+ }
+
+ if (udata->outlen) {
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+ err = ib_copy_to_udata(udata, &resp, resp.response_length);
+ }
+
+ return err;
+}
+
+struct mlx4_ib_drain_cqe {
+ struct ib_cqe cqe;
+ struct completion done;
+};
+
+static void mlx4_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct mlx4_ib_drain_cqe *cqe = container_of(wc->wr_cqe,
+ struct mlx4_ib_drain_cqe,
+ cqe);
+
+ complete(&cqe->done);
+}
+
+/* This function returns only once the drained WR was completed */
+static void handle_drain_completion(struct ib_cq *cq,
+ struct mlx4_ib_drain_cqe *sdrain,
+ struct mlx4_ib_dev *dev)
+{
+ struct mlx4_dev *mdev = dev->dev;
+
+ if (cq->poll_ctx == IB_POLL_DIRECT) {
+ while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0)
+ ib_process_cq_direct(cq, -1);
+ return;
+ }
+
+ if (mdev->persist->state == MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+ bool triggered = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ /* Make sure that the CQ handler won't run if wasn't run yet */
+ if (!mcq->mcq.reset_notify_added)
+ mcq->mcq.reset_notify_added = 1;
+ else
+ triggered = true;
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
+ if (triggered) {
+ /* Wait for any scheduled/running task to be ended */
+ switch (cq->poll_ctx) {
+ case IB_POLL_SOFTIRQ:
+ irq_poll_disable(&cq->iop);
+ irq_poll_enable(&cq->iop);
+ break;
+ case IB_POLL_WORKQUEUE:
+ cancel_work_sync(&cq->work);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ /* Run the CQ handler - this makes sure that the drain WR will
+ * be processed if wasn't processed yet.
+ */
+ mcq->mcq.comp(&mcq->mcq);
+ }
+
+ wait_for_completion(&sdrain->done);
+}
+
+void mlx4_ib_drain_sq(struct ib_qp *qp)
+{
+ struct ib_cq *cq = qp->send_cq;
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct mlx4_ib_drain_cqe sdrain;
+ const struct ib_send_wr *bad_swr;
+ struct ib_rdma_wr swr = {
+ .wr = {
+ .next = NULL,
+ { .wr_cqe = &sdrain.cqe, },
+ .opcode = IB_WR_RDMA_WRITE,
+ },
+ };
+ int ret;
+ struct mlx4_ib_dev *dev = to_mdev(qp->device);
+ struct mlx4_dev *mdev = dev->dev;
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ sdrain.cqe.done = mlx4_ib_drain_qp_done;
+ init_completion(&sdrain.done);
+
+ ret = _mlx4_ib_post_send(qp, &swr.wr, &bad_swr, true);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ handle_drain_completion(cq, &sdrain, dev);
+}
+
+void mlx4_ib_drain_rq(struct ib_qp *qp)
+{
+ struct ib_cq *cq = qp->recv_cq;
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct mlx4_ib_drain_cqe rdrain;
+ struct ib_recv_wr rwr = {};
+ const struct ib_recv_wr *bad_rwr;
+ int ret;
+ struct mlx4_ib_dev *dev = to_mdev(qp->device);
+ struct mlx4_dev *mdev = dev->dev;
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ rwr.wr_cqe = &rdrain.cqe;
+ rdrain.cqe.done = mlx4_ib_drain_qp_done;
+ init_completion(&rdrain.done);
+
+ ret = _mlx4_ib_post_recv(qp, &rwr, &bad_rwr, true);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ handle_drain_completion(cq, &rdrain, dev);
+}
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644
index 000000000..c4cf91235
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+#include <rdma/mlx4-abi.h>
+#include <rdma/uverbs_ioctl.h>
+
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{
+ return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+ if (ibsrq->event_handler) {
+ event.device = ibsrq->device;
+ event.element.srq = ibsrq;
+ switch (type) {
+ case MLX4_EVENT_TYPE_SRQ_LIMIT:
+ event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+ event.event = IB_EVENT_SRQ_ERR;
+ break;
+ default:
+ pr_warn("Unexpected event type %d "
+ "on SRQ %06x\n", type, srq->srqn);
+ return;
+ }
+
+ ibsrq->event_handler(&event, ibsrq->srq_context);
+ }
+}
+
+int mlx4_ib_create_srq(struct ib_srq *ib_srq,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ib_srq->device);
+ struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
+ struct mlx4_ib_srq *srq = to_msrq(ib_srq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scatter;
+ u32 cqn;
+ u16 xrcdn;
+ int desc_size;
+ int buf_size;
+ int err;
+ int i;
+
+ if (init_attr->srq_type != IB_SRQT_BASIC &&
+ init_attr->srq_type != IB_SRQT_XRC)
+ return -EOPNOTSUPP;
+
+ /* Sanity check SRQ size before proceeding */
+ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes ||
+ init_attr->attr.max_sge > dev->dev->caps.max_srq_sge)
+ return -EINVAL;
+
+ mutex_init(&srq->mutex);
+ spin_lock_init(&srq->lock);
+ srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+ srq->msrq.max_gs = init_attr->attr.max_sge;
+
+ desc_size = max(32UL,
+ roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+ srq->msrq.max_gs *
+ sizeof (struct mlx4_wqe_data_seg)));
+ srq->msrq.wqe_shift = ilog2(desc_size);
+
+ buf_size = srq->msrq.max * desc_size;
+
+ if (udata) {
+ struct mlx4_ib_create_srq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+ return -EFAULT;
+
+ srq->umem =
+ ib_umem_get(ib_srq->device, ucmd.buf_addr, buf_size, 0);
+ if (IS_ERR(srq->umem))
+ return PTR_ERR(srq->umem);
+
+ err = mlx4_mtt_init(
+ dev->dev, ib_umem_num_dma_blocks(srq->umem, PAGE_SIZE),
+ PAGE_SHIFT, &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+ if (err)
+ goto err_mtt;
+
+ err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &srq->db);
+ if (err)
+ goto err_mtt;
+ } else {
+ err = mlx4_db_alloc(dev->dev, &srq->db, 0);
+ if (err)
+ return err;
+
+ *srq->db.db = 0;
+
+ if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2,
+ &srq->buf)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ srq->head = 0;
+ srq->tail = srq->msrq.max - 1;
+ srq->wqe_ctr = 0;
+
+ for (i = 0; i < srq->msrq.max; ++i) {
+ next = get_wqe(srq, i);
+ next->next_wqe_index =
+ cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+
+ for (scatter = (void *) (next + 1);
+ (void *) scatter < (void *) next + desc_size;
+ ++scatter)
+ scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ }
+
+ err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+ &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf);
+ if (err)
+ goto err_mtt;
+
+ srq->wrid = kvmalloc_array(srq->msrq.max,
+ sizeof(u64), GFP_KERNEL);
+ if (!srq->wrid) {
+ err = -ENOMEM;
+ goto err_mtt;
+ }
+ }
+
+ cqn = ib_srq_has_cq(init_attr->srq_type) ?
+ to_mcq(init_attr->ext.cq)->mcq.cqn : 0;
+ xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ?
+ to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn :
+ (u16) dev->dev->caps.reserved_xrcds;
+ err = mlx4_srq_alloc(dev->dev, to_mpd(ib_srq->pd)->pdn, cqn, xrcdn,
+ &srq->mtt, srq->db.dma, &srq->msrq);
+ if (err)
+ goto err_wrid;
+
+ srq->msrq.event = mlx4_ib_srq_event;
+ srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
+
+ if (udata)
+ if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_wrid;
+ }
+
+ init_attr->attr.max_wr = srq->msrq.max - 1;
+
+ return 0;
+
+err_wrid:
+ if (udata)
+ mlx4_ib_db_unmap_user(ucontext, &srq->db);
+ else
+ kvfree(srq->wrid);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+ if (!srq->umem)
+ mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+ ib_umem_release(srq->umem);
+
+err_db:
+ if (!udata)
+ mlx4_db_free(dev->dev, &srq->db);
+
+ return err;
+}
+
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+
+ /* We don't support resizing SRQs (yet?) */
+ if (attr_mask & IB_SRQ_MAX_WR)
+ return -EINVAL;
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ if (attr->srq_limit >= srq->msrq.max)
+ return -EINVAL;
+
+ mutex_lock(&srq->mutex);
+ ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+ mutex_unlock(&srq->mutex);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+ int limit_watermark;
+
+ ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark);
+ if (ret)
+ return ret;
+
+ srq_attr->srq_limit = limit_watermark;
+ srq_attr->max_wr = srq->msrq.max - 1;
+ srq_attr->max_sge = srq->msrq.max_gs;
+
+ return 0;
+}
+
+int mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(srq->device);
+ struct mlx4_ib_srq *msrq = to_msrq(srq);
+
+ mlx4_srq_free(dev->dev, &msrq->msrq);
+ mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+ if (udata) {
+ mlx4_ib_db_unmap_user(
+ rdma_udata_to_drv_context(
+ udata,
+ struct mlx4_ib_ucontext,
+ ibucontext),
+ &msrq->db);
+ } else {
+ kvfree(msrq->wrid);
+ mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+ &msrq->buf);
+ mlx4_db_free(dev->dev, &msrq->db);
+ }
+ ib_umem_release(msrq->umem);
+ return 0;
+}
+
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+
+ /* always called with interrupts disabled. */
+ spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = cpu_to_be16(wqe_index);
+ srq->tail = wqe_index;
+
+ spin_unlock(&srq->lock);
+}
+
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device);
+
+ spin_lock_irqsave(&srq->lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (unlikely(srq->head == srq->tail)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16_to_cpu(next->next_wqe_index);
+ scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+ scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey);
+ scat[i].addr = cpu_to_be64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->msrq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (likely(nreq)) {
+ srq->wqe_ctr += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *srq->db.db = cpu_to_be32(srq->wqe_ctr);
+ }
+out:
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
new file mode 100644
index 000000000..88f534cf6
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*#include "core_priv.h"*/
+#include "mlx4_ib.h"
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+
+#include <rdma/ib_mad.h>
+/*show_admin_alias_guid returns the administratively assigned value of that GUID.
+ * Values returned in buf parameter string:
+ * 0 - requests opensm to assign a value.
+ * ffffffffffffffff - delete this entry.
+ * other - value assigned by administrator.
+ */
+static ssize_t show_admin_alias_guid(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ __be64 sysadmin_ag_val;
+
+ sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev,
+ mlx4_ib_iov_dentry->entry_num,
+ port->num);
+
+ return sysfs_emit(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val));
+}
+
+/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID.
+ * Values in buf parameter string:
+ * 0 - requests opensm to assign a value.
+ * 0xffffffffffffffff - delete this entry.
+ * other - guid value assigned by the administrator.
+ */
+static ssize_t store_admin_alias_guid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int record_num;/*0-15*/
+ int guid_index_in_rec; /*0 - 7*/
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ u64 sysadmin_ag_val;
+ unsigned long flags;
+
+ record_num = mlx4_ib_iov_dentry->entry_num / 8;
+ guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
+ if (0 == record_num && 0 == guid_index_in_rec) {
+ pr_err("GUID 0 block 0 is RO\n");
+ return count;
+ }
+ spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags);
+ sscanf(buf, "%llx", &sysadmin_ag_val);
+ *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * guid_index_in_rec] =
+ cpu_to_be64(sysadmin_ag_val);
+
+ /* Change the state to be pending for update */
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status
+ = MLX4_GUID_INFO_STATUS_IDLE ;
+ mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val),
+ mlx4_ib_iov_dentry->entry_num,
+ port->num);
+
+ /* set the record index */
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes
+ |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+
+ spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags);
+ mlx4_ib_init_alias_guid_work(mdev, port->num - 1);
+
+ return count;
+}
+
+static ssize_t show_port_gid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ union ib_gid gid;
+ int ret;
+ __be16 *raw;
+
+ ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num,
+ mlx4_ib_iov_dentry->entry_num, &gid, 1);
+ if (ret)
+ return ret;
+
+ raw = (__be16 *)gid.raw;
+ return sysfs_emit(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(raw[0]),
+ be16_to_cpu(raw[1]),
+ be16_to_cpu(raw[2]),
+ be16_to_cpu(raw[3]),
+ be16_to_cpu(raw[4]),
+ be16_to_cpu(raw[5]),
+ be16_to_cpu(raw[6]),
+ be16_to_cpu(raw[7]));
+}
+
+static ssize_t show_phys_port_pkey(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ u16 pkey;
+ ssize_t ret;
+
+ ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num,
+ mlx4_ib_iov_dentry->entry_num, &pkey, 1);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "0x%04x\n", pkey);
+}
+
+#define DENTRY_REMOVE(_dentry) \
+do { \
+ sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \
+} while (0);
+
+static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry,
+ char *_name, struct kobject *_kobj,
+ ssize_t (*show)(struct device *dev,
+ struct device_attribute *attr,
+ char *buf),
+ ssize_t (*store)(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+ )
+{
+ int ret = 0;
+ struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry;
+
+ vdentry->ctx = _ctx;
+ vdentry->dentry.show = show;
+ vdentry->dentry.store = store;
+ sysfs_attr_init(&vdentry->dentry.attr);
+ vdentry->dentry.attr.name = vdentry->name;
+ vdentry->dentry.attr.mode = 0;
+ vdentry->kobj = _kobj;
+ snprintf(vdentry->name, 15, "%s", _name);
+
+ if (vdentry->dentry.store)
+ vdentry->dentry.attr.mode |= S_IWUSR;
+
+ if (vdentry->dentry.show)
+ vdentry->dentry.attr.mode |= S_IRUGO;
+
+ ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr);
+ if (ret) {
+ pr_err("failed to create %s\n", vdentry->dentry.attr.name);
+ vdentry->ctx = NULL;
+ return ret;
+ }
+
+ return ret;
+}
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr)
+{
+ struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+ int ret;
+
+ ret = sysfs_create_file(port->mcgs_parent, attr);
+ if (ret)
+ pr_err("failed to create %s\n", attr->name);
+
+ return ret;
+}
+
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr)
+{
+ struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+
+ sysfs_remove_file(port->mcgs_parent, attr);
+}
+
+static int add_port_entries(struct mlx4_ib_dev *device, int port_num)
+{
+ int i;
+ char buff[12];
+ struct mlx4_ib_iov_port *port = NULL;
+ int ret = 0 ;
+ struct ib_port_attr attr;
+
+ memset(&attr, 0, sizeof(attr));
+ /* get the physical gid and pkey table sizes.*/
+ ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1);
+ if (ret)
+ goto err;
+
+ port = &device->iov_ports[port_num - 1];
+ port->dev = device;
+ port->num = port_num;
+ /* Directory structure:
+ * iov -
+ * port num -
+ * admin_guids
+ * gids (operational)
+ * mcg_table
+ */
+ port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar),
+ GFP_KERNEL);
+ if (!port->dentr_ar) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ sprintf(buff, "%d", port_num);
+ port->cur_port = kobject_create_and_add(buff,
+ kobject_get(device->ports_parent));
+ if (!port->cur_port) {
+ ret = -ENOMEM;
+ goto kobj_create_err;
+ }
+ /* admin GUIDs */
+ port->admin_alias_parent = kobject_create_and_add("admin_guids",
+ kobject_get(port->cur_port));
+ if (!port->admin_alias_parent) {
+ ret = -ENOMEM;
+ goto err_admin_guids;
+ }
+ for (i = 0 ; i < attr.gid_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[i].entry_num = i;
+ ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i],
+ buff, port->admin_alias_parent,
+ show_admin_alias_guid, store_admin_alias_guid);
+ if (ret)
+ goto err_admin_alias_parent;
+ }
+
+ /* gids subdirectory (operational gids) */
+ port->gids_parent = kobject_create_and_add("gids",
+ kobject_get(port->cur_port));
+ if (!port->gids_parent) {
+ ret = -ENOMEM;
+ goto err_gids;
+ }
+
+ for (i = 0 ; i < attr.gid_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i;
+ ret = create_sysfs_entry(port,
+ &port->dentr_ar->dentries[attr.gid_tbl_len + i],
+ buff,
+ port->gids_parent, show_port_gid, NULL);
+ if (ret)
+ goto err_gids_parent;
+ }
+
+ /* physical port pkey table */
+ port->pkeys_parent =
+ kobject_create_and_add("pkeys", kobject_get(port->cur_port));
+ if (!port->pkeys_parent) {
+ ret = -ENOMEM;
+ goto err_pkeys;
+ }
+
+ for (i = 0 ; i < attr.pkey_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i;
+ ret = create_sysfs_entry(port,
+ &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i],
+ buff, port->pkeys_parent,
+ show_phys_port_pkey, NULL);
+ if (ret)
+ goto err_pkeys_parent;
+ }
+
+ /* MCGs table */
+ port->mcgs_parent =
+ kobject_create_and_add("mcgs", kobject_get(port->cur_port));
+ if (!port->mcgs_parent) {
+ ret = -ENOMEM;
+ goto err_mcgs;
+ }
+ return 0;
+
+err_mcgs:
+ kobject_put(port->cur_port);
+
+err_pkeys_parent:
+ kobject_put(port->pkeys_parent);
+
+err_pkeys:
+ kobject_put(port->cur_port);
+
+err_gids_parent:
+ kobject_put(port->gids_parent);
+
+err_gids:
+ kobject_put(port->cur_port);
+
+err_admin_alias_parent:
+ kobject_put(port->admin_alias_parent);
+
+err_admin_guids:
+ kobject_put(port->cur_port);
+ kobject_put(port->cur_port); /* once more for create_and_add buff */
+
+kobj_create_err:
+ kobject_put(device->ports_parent);
+ kfree(port->dentr_ar);
+
+err:
+ pr_err("add_port_entries FAILED: for port:%d, error: %d\n",
+ port_num, ret);
+ return ret;
+}
+
+static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max)
+{
+ /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n
+ * with no ARI only 3 last bits are used so when the fn is higher than 8
+ * need to add it to the dev num, so count in the last number will be
+ * modulo 8 */
+ snprintf(name, max, "%.8s%.2d.%d", pci_name(dev->dev->persist->pdev),
+ i / 8, i % 8);
+}
+
+struct mlx4_port {
+ struct kobject kobj;
+ struct mlx4_ib_dev *dev;
+ struct attribute_group pkey_group;
+ struct attribute_group gid_group;
+ struct device_attribute enable_smi_admin;
+ struct device_attribute smi_enabled;
+ int slave;
+ u8 port_num;
+};
+
+
+static void mlx4_port_release(struct kobject *kobj)
+{
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+ struct attribute *a;
+ int i;
+
+ for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+ kfree(a);
+ kfree(p->pkey_group.attrs);
+ for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+ kfree(a);
+ kfree(p->gid_group.attrs);
+ kfree(p);
+}
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+ return port_attr->show(p, port_attr, buf);
+}
+
+static ssize_t port_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t size)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+ if (!port_attr->store)
+ return -EIO;
+ return port_attr->store(p, port_attr, buf, size);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show,
+ .store = port_attr_store,
+};
+
+static struct kobj_type port_type = {
+ .release = mlx4_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+};
+
+struct port_table_attribute {
+ struct port_attribute attr;
+ char name[8];
+ int index;
+};
+
+static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ struct pkey_mgt *m = &p->dev->pkeys;
+ u8 key = m->virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index];
+
+ if (key >= p->dev->dev->caps.pkey_table_len[p->port_num])
+ return sysfs_emit(buf, "none\n");
+ return sysfs_emit(buf, "%d\n", key);
+}
+
+static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int idx;
+ int err;
+
+ /* do not allow remapping Dom0 virtual pkey table */
+ if (p->slave == mlx4_master_func_num(p->dev->dev))
+ return -EINVAL;
+
+ if (!strncasecmp(buf, "no", 2))
+ idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1;
+ else if (sscanf(buf, "%i", &idx) != 1 ||
+ idx >= p->dev->dev->caps.pkey_table_len[p->port_num] ||
+ idx < 0)
+ return -EINVAL;
+
+ p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1]
+ [tab_attr->index] = idx;
+ mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num,
+ tab_attr->index, idx);
+ err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num);
+ if (err) {
+ pr_err("mlx4_gen_pkey_eqe failed for slave %d,"
+ " port %d, index %d\n", p->slave, p->port_num, idx);
+ return err;
+ }
+ return count;
+}
+
+static ssize_t show_port_gid_idx(struct mlx4_port *p,
+ struct port_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", p->slave);
+}
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct mlx4_port *,
+ struct port_attribute *, char *buf),
+ ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+ const char *buf, size_t count),
+ int len)
+{
+ struct attribute **tab_attr;
+ struct port_table_attribute *element;
+ int i;
+
+ tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL);
+ if (!tab_attr)
+ return NULL;
+
+ for (i = 0; i < len; i++) {
+ element = kzalloc(sizeof (struct port_table_attribute),
+ GFP_KERNEL);
+ if (!element)
+ goto err;
+ if (snprintf(element->name, sizeof (element->name),
+ "%d", i) >= sizeof (element->name)) {
+ kfree(element);
+ goto err;
+ }
+ sysfs_attr_init(&element->attr.attr);
+ element->attr.attr.name = element->name;
+ if (store) {
+ element->attr.attr.mode = S_IWUSR | S_IRUGO;
+ element->attr.store = store;
+ } else
+ element->attr.attr.mode = S_IRUGO;
+
+ element->attr.show = show;
+ element->index = i;
+ tab_attr[i] = &element->attr.attr;
+ }
+ return tab_attr;
+
+err:
+ while (--i >= 0)
+ kfree(tab_attr[i]);
+ kfree(tab_attr);
+ return NULL;
+}
+
+static ssize_t sysfs_show_smi_enabled(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, smi_enabled);
+
+ return sysfs_emit(buf, "%d\n",
+ !!mlx4_vf_smi_enabled(p->dev->dev, p->slave,
+ p->port_num));
+}
+
+static ssize_t sysfs_show_enable_smi_admin(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, enable_smi_admin);
+
+ return sysfs_emit(buf, "%d\n",
+ !!mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave,
+ p->port_num));
+}
+
+static ssize_t sysfs_store_enable_smi_admin(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, enable_smi_admin);
+ int enable;
+
+ if (sscanf(buf, "%i", &enable) != 1 ||
+ enable < 0 || enable > 1)
+ return -EINVAL;
+
+ if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable))
+ return -EINVAL;
+ return count;
+}
+
+static int add_vf_smi_entries(struct mlx4_port *p)
+{
+ int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+ int ret;
+
+ /* do not display entries if eth transport, or if master */
+ if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+ return 0;
+
+ sysfs_attr_init(&p->smi_enabled.attr);
+ p->smi_enabled.show = sysfs_show_smi_enabled;
+ p->smi_enabled.store = NULL;
+ p->smi_enabled.attr.name = "smi_enabled";
+ p->smi_enabled.attr.mode = 0444;
+ ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr);
+ if (ret) {
+ pr_err("failed to create smi_enabled\n");
+ return ret;
+ }
+
+ sysfs_attr_init(&p->enable_smi_admin.attr);
+ p->enable_smi_admin.show = sysfs_show_enable_smi_admin;
+ p->enable_smi_admin.store = sysfs_store_enable_smi_admin;
+ p->enable_smi_admin.attr.name = "enable_smi_admin";
+ p->enable_smi_admin.attr.mode = 0644;
+ ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr);
+ if (ret) {
+ pr_err("failed to create enable_smi_admin\n");
+ sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+ return ret;
+ }
+ return 0;
+}
+
+static void remove_vf_smi_entries(struct mlx4_port *p)
+{
+ int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+
+ if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+ return;
+
+ sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+ sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr);
+}
+
+static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
+{
+ struct mlx4_port *p;
+ int i;
+ int ret;
+ int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->dev = dev;
+ p->port_num = port_num;
+ p->slave = slave;
+
+ ret = kobject_init_and_add(&p->kobj, &port_type,
+ kobject_get(dev->dev_ports_parent[slave]),
+ "%d", port_num);
+ if (ret)
+ goto err_alloc;
+
+ p->pkey_group.name = "pkey_idx";
+ p->pkey_group.attrs =
+ alloc_group_attrs(show_port_pkey,
+ is_eth ? NULL : store_port_pkey,
+ dev->dev->caps.pkey_table_len[port_num]);
+ if (!p->pkey_group.attrs) {
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ p->gid_group.name = "gid_idx";
+ p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1);
+ if (!p->gid_group.attrs) {
+ ret = -ENOMEM;
+ goto err_free_pkey;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ ret = add_vf_smi_entries(p);
+ if (ret)
+ goto err_free_gid;
+
+ list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]);
+ return 0;
+
+err_free_gid:
+ kfree(p->gid_group.attrs[0]);
+ kfree(p->gid_group.attrs);
+
+err_free_pkey:
+ for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i)
+ kfree(p->pkey_group.attrs[i]);
+ kfree(p->pkey_group.attrs);
+
+err_alloc:
+ kobject_put(dev->dev_ports_parent[slave]);
+ kfree(p);
+ return ret;
+}
+
+static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave)
+{
+ char name[32];
+ int err;
+ int port;
+ struct kobject *p, *t;
+ struct mlx4_port *mport;
+ struct mlx4_active_ports actv_ports;
+
+ get_name(dev, name, slave, sizeof name);
+
+ dev->pkeys.device_parent[slave] =
+ kobject_create_and_add(name, kobject_get(dev->iov_parent));
+
+ if (!dev->pkeys.device_parent[slave]) {
+ err = -ENOMEM;
+ goto fail_dev;
+ }
+
+ INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]);
+
+ dev->dev_ports_parent[slave] =
+ kobject_create_and_add("ports",
+ kobject_get(dev->pkeys.device_parent[slave]));
+
+ if (!dev->dev_ports_parent[slave]) {
+ err = -ENOMEM;
+ goto err_ports;
+ }
+
+ actv_ports = mlx4_get_active_ports(dev->dev, slave);
+
+ for (port = 1; port <= dev->dev->caps.num_ports; ++port) {
+ if (!test_bit(port - 1, actv_ports.ports))
+ continue;
+ err = add_port(dev, port, slave);
+ if (err)
+ goto err_add;
+ }
+ return 0;
+
+err_add:
+ list_for_each_entry_safe(p, t,
+ &dev->pkeys.pkey_port_list[slave],
+ entry) {
+ list_del(&p->entry);
+ mport = container_of(p, struct mlx4_port, kobj);
+ sysfs_remove_group(p, &mport->pkey_group);
+ sysfs_remove_group(p, &mport->gid_group);
+ remove_vf_smi_entries(mport);
+ kobject_put(p);
+ }
+ kobject_put(dev->dev_ports_parent[slave]);
+
+err_ports:
+ kobject_put(dev->pkeys.device_parent[slave]);
+ /* extra put for the device_parent create_and_add */
+ kobject_put(dev->pkeys.device_parent[slave]);
+
+fail_dev:
+ kobject_put(dev->iov_parent);
+ return err;
+}
+
+static int register_pkey_tree(struct mlx4_ib_dev *device)
+{
+ int i;
+
+ if (!mlx4_is_master(device->dev))
+ return 0;
+
+ for (i = 0; i <= device->dev->persist->num_vfs; ++i)
+ register_one_pkey_tree(device, i);
+
+ return 0;
+}
+
+static void unregister_pkey_tree(struct mlx4_ib_dev *device)
+{
+ int slave;
+ struct kobject *p, *t;
+ struct mlx4_port *port;
+
+ if (!mlx4_is_master(device->dev))
+ return;
+
+ for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) {
+ list_for_each_entry_safe(p, t,
+ &device->pkeys.pkey_port_list[slave],
+ entry) {
+ list_del(&p->entry);
+ port = container_of(p, struct mlx4_port, kobj);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ remove_vf_smi_entries(port);
+ kobject_put(p);
+ kobject_put(device->dev_ports_parent[slave]);
+ }
+ kobject_put(device->dev_ports_parent[slave]);
+ kobject_put(device->pkeys.device_parent[slave]);
+ kobject_put(device->pkeys.device_parent[slave]);
+ kobject_put(device->iov_parent);
+ }
+}
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev)
+{
+ unsigned int i;
+ int ret = 0;
+
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+
+ dev->iov_parent = kobject_create_and_add("iov", &dev->ib_dev.dev.kobj);
+ if (!dev->iov_parent) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ dev->ports_parent =
+ kobject_create_and_add("ports",
+ kobject_get(dev->iov_parent));
+ if (!dev->ports_parent) {
+ ret = -ENOMEM;
+ goto err_ports;
+ }
+
+ rdma_for_each_port(&dev->ib_dev, i) {
+ ret = add_port_entries(dev, i);
+ if (ret)
+ goto err_add_entries;
+ }
+
+ ret = register_pkey_tree(dev);
+ if (ret)
+ goto err_add_entries;
+ return 0;
+
+err_add_entries:
+ kobject_put(dev->ports_parent);
+
+err_ports:
+ kobject_put(dev->iov_parent);
+err:
+ pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret);
+ return ret;
+}
+
+static void unregister_alias_guid_tree(struct mlx4_ib_dev *device)
+{
+ struct mlx4_ib_iov_port *p;
+ int i;
+
+ if (!mlx4_is_master(device->dev))
+ return;
+
+ for (i = 0; i < device->dev->caps.num_ports; i++) {
+ p = &device->iov_ports[i];
+ kobject_put(p->admin_alias_parent);
+ kobject_put(p->gids_parent);
+ kobject_put(p->pkeys_parent);
+ kobject_put(p->mcgs_parent);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->dev->ports_parent);
+ kfree(p->dentr_ar);
+ }
+}
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device)
+{
+ unregister_alias_guid_tree(device);
+ unregister_pkey_tree(device);
+ kobject_put(device->ports_parent);
+ kobject_put(device->iov_parent);
+ kobject_put(device->iov_parent);
+}