summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/Makefile41
-rw-r--r--drivers/infiniband/core/addr.c785
-rw-r--r--drivers/infiniband/core/agent.c221
-rw-r--r--drivers/infiniband/core/agent.h51
-rw-r--r--drivers/infiniband/core/cache.c1463
-rw-r--r--drivers/infiniband/core/cgroup.c62
-rw-r--r--drivers/infiniband/core/cm.c4577
-rw-r--r--drivers/infiniband/core/cm_msgs.h829
-rw-r--r--drivers/infiniband/core/cma.c4702
-rw-r--r--drivers/infiniband/core/cma_configfs.c363
-rw-r--r--drivers/infiniband/core/cma_priv.h97
-rw-r--r--drivers/infiniband/core/core_priv.h344
-rw-r--r--drivers/infiniband/core/cq.c231
-rw-r--r--drivers/infiniband/core/device.c1261
-rw-r--r--drivers/infiniband/core/fmr_pool.c508
-rw-r--r--drivers/infiniband/core/iwcm.c1210
-rw-r--r--drivers/infiniband/core/iwcm.h62
-rw-r--r--drivers/infiniband/core/iwpm_msg.c750
-rw-r--r--drivers/infiniband/core/iwpm_util.c756
-rw-r--r--drivers/infiniband/core/iwpm_util.h269
-rw-r--r--drivers/infiniband/core/mad.c3372
-rw-r--r--drivers/infiniband/core/mad_priv.h226
-rw-r--r--drivers/infiniband/core/mad_rmpp.c968
-rw-r--r--drivers/infiniband/core/mad_rmpp.h58
-rw-r--r--drivers/infiniband/core/mr_pool.c86
-rw-r--r--drivers/infiniband/core/multicast.c908
-rw-r--r--drivers/infiniband/core/netlink.c311
-rw-r--r--drivers/infiniband/core/nldev.c1121
-rw-r--r--drivers/infiniband/core/opa_smi.h78
-rw-r--r--drivers/infiniband/core/packer.c201
-rw-r--r--drivers/infiniband/core/rdma_core.c1030
-rw-r--r--drivers/infiniband/core/rdma_core.h164
-rw-r--r--drivers/infiniband/core/restrack.c233
-rw-r--r--drivers/infiniband/core/roce_gid_mgmt.c924
-rw-r--r--drivers/infiniband/core/rw.c745
-rw-r--r--drivers/infiniband/core/sa.h66
-rw-r--r--drivers/infiniband/core/sa_query.c2527
-rw-r--r--drivers/infiniband/core/security.c749
-rw-r--r--drivers/infiniband/core/smi.c338
-rw-r--r--drivers/infiniband/core/smi.h90
-rw-r--r--drivers/infiniband/core/sysfs.c1381
-rw-r--r--drivers/infiniband/core/ucm.c1359
-rw-r--r--drivers/infiniband/core/ucma.c1896
-rw-r--r--drivers/infiniband/core/ud_header.c547
-rw-r--r--drivers/infiniband/core/umem.c349
-rw-r--r--drivers/infiniband/core/umem_odp.c852
-rw-r--r--drivers/infiniband/core/user_mad.c1425
-rw-r--r--drivers/infiniband/core/uverbs.h362
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c4146
-rw-r--r--drivers/infiniband/core/uverbs_ioctl.c613
-rw-r--r--drivers/infiniband/core/uverbs_main.c1261
-rw-r--r--drivers/infiniband/core/uverbs_marshall.c215
-rw-r--r--drivers/infiniband/core/uverbs_std_types.c286
-rw-r--r--drivers/infiniband/core/uverbs_std_types_counters.c151
-rw-r--r--drivers/infiniband/core/uverbs_std_types_cq.c209
-rw-r--r--drivers/infiniband/core/uverbs_std_types_dm.c111
-rw-r--r--drivers/infiniband/core/uverbs_std_types_flow_action.c443
-rw-r--r--drivers/infiniband/core/uverbs_std_types_mr.c149
-rw-r--r--drivers/infiniband/core/uverbs_uapi.c349
-rw-r--r--drivers/infiniband/core/verbs.c2630
60 files changed, 51511 insertions, 0 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
new file mode 100644
index 000000000..867cee5e2
--- /dev/null
+++ b/drivers/infiniband/core/Makefile
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
+
+obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \
+ $(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y)
+obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y)
+
+ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
+ device.o fmr_pool.o cache.o netlink.o \
+ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
+ multicast.o mad.o smi.o agent.o mad_rmpp.o \
+ nldev.o restrack.o
+
+ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
+
+ib_cm-y := cm.o
+
+iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o
+
+rdma_cm-y := cma.o
+
+rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
+
+rdma_ucm-y := ucma.o
+
+ib_umad-y := user_mad.o
+
+ib_ucm-y := ucm.o
+
+ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
+ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
+ uverbs_std_types_cq.o \
+ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
+ uverbs_std_types_mr.o uverbs_std_types_counters.o \
+ uverbs_uapi.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
new file mode 100644
index 000000000..30385ba7c
--- /dev/null
+++ b/drivers/infiniband/core/addr.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+
+#include "core_priv.h"
+
+struct addr_req {
+ struct list_head list;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr *addr;
+ void *context;
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context);
+ unsigned long timeout;
+ struct delayed_work work;
+ int status;
+ u32 seq;
+};
+
+static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);
+
+static DEFINE_SPINLOCK(lock);
+static LIST_HEAD(req_list);
+static struct workqueue_struct *addr_wq;
+
+static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+};
+
+static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return false;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_addr_policy, NULL);
+ if (ret)
+ return false;
+
+ return true;
+}
+
+static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
+{
+ const struct nlattr *head, *curr;
+ union ib_gid gid;
+ struct addr_req *req;
+ int len, rem;
+ int found = 0;
+
+ head = (const struct nlattr *)nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type == LS_NLA_TYPE_DGID)
+ memcpy(&gid, nla_data(curr), nla_len(curr));
+ }
+
+ spin_lock_bh(&lock);
+ list_for_each_entry(req, &req_list, list) {
+ if (nlh->nlmsg_seq != req->seq)
+ continue;
+ /* We set the DGID part, the rest was set earlier */
+ rdma_addr_set_dgid(req->addr, &gid);
+ req->status = 0;
+ found = 1;
+ break;
+ }
+ spin_unlock_bh(&lock);
+
+ if (!found)
+ pr_info("Couldn't find request waiting for DGID: %pI6\n",
+ &gid);
+}
+
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk))
+ return -EPERM;
+
+ if (ib_nl_is_good_ip_resp(nlh))
+ ib_nl_process_good_ip_rsep(nlh);
+
+ return 0;
+}
+
+static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
+ const void *daddr,
+ u32 seq, u16 family)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ struct rdma_ls_ip_resolve_header *header;
+ void *data;
+ size_t size;
+ int attrtype;
+ int len;
+
+ if (family == AF_INET) {
+ size = sizeof(struct in_addr);
+ attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
+ } else {
+ size = sizeof(struct in6_addr);
+ attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
+ }
+
+ len = nla_total_size(sizeof(size));
+ len += NLMSG_ALIGN(sizeof(*header));
+
+ skb = nlmsg_new(len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ nlmsg_free(skb);
+ return -ENODATA;
+ }
+
+ /* Construct the family header first */
+ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ header->ifindex = dev_addr->bound_dev_if;
+ nla_put(skb, attrtype, size, daddr);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+ rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL);
+
+ /* Make the request retry, so when we get the response from userspace
+ * we will have something.
+ */
+ return -ENODATA;
+}
+
+int rdma_addr_size(const struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return sizeof(struct sockaddr_in);
+ case AF_INET6:
+ return sizeof(struct sockaddr_in6);
+ case AF_IB:
+ return sizeof(struct sockaddr_ib);
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(rdma_addr_size);
+
+int rdma_addr_size_in6(struct sockaddr_in6 *addr)
+{
+ int ret = rdma_addr_size((struct sockaddr *) addr);
+
+ return ret <= sizeof(*addr) ? ret : 0;
+}
+EXPORT_SYMBOL(rdma_addr_size_in6);
+
+int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr)
+{
+ int ret = rdma_addr_size((struct sockaddr *) addr);
+
+ return ret <= sizeof(*addr) ? ret : 0;
+}
+EXPORT_SYMBOL(rdma_addr_size_kss);
+
+void rdma_copy_addr(struct rdma_dev_addr *dev_addr,
+ const struct net_device *dev,
+ const unsigned char *dst_dev_addr)
+{
+ dev_addr->dev_type = dev->type;
+ memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+ memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+ if (dst_dev_addr)
+ memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+ dev_addr->bound_dev_if = dev->ifindex;
+}
+EXPORT_SYMBOL(rdma_copy_addr);
+
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr)
+{
+ struct net_device *dev;
+
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ return 0;
+ }
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ dev = ip_dev_find(dev_addr->net,
+ ((const struct sockaddr_in *)addr)->sin_addr.s_addr);
+
+ if (!dev)
+ return -EADDRNOTAVAIL;
+
+ rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ rcu_read_lock();
+ for_each_netdev_rcu(dev_addr->net, dev) {
+ if (ipv6_chk_addr(dev_addr->net,
+ &((const struct sockaddr_in6 *)addr)->sin6_addr,
+ dev, 1)) {
+ rdma_copy_addr(dev_addr, dev, NULL);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ break;
+#endif
+ }
+ return 0;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(struct addr_req *req, unsigned long time)
+{
+ unsigned long delay;
+
+ delay = time - jiffies;
+ if ((long)delay < 0)
+ delay = 0;
+
+ mod_delayed_work(addr_wq, &req->work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+ spin_lock_bh(&lock);
+ list_add_tail(&req->list, &req_list);
+ set_timeout(req, req->timeout);
+ spin_unlock_bh(&lock);
+}
+
+static int ib_nl_fetch_ha(const struct dst_entry *dst,
+ struct rdma_dev_addr *dev_addr,
+ const void *daddr, u32 seq, u16 family)
+{
+ if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
+ return -EADDRNOTAVAIL;
+
+ /* We fill in what we can, the response will fill the rest */
+ rdma_copy_addr(dev_addr, dst->dev, NULL);
+ return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
+}
+
+static int dst_fetch_ha(const struct dst_entry *dst,
+ struct rdma_dev_addr *dev_addr,
+ const void *daddr)
+{
+ struct neighbour *n;
+ int ret = 0;
+
+ n = dst_neigh_lookup(dst, daddr);
+ if (!n)
+ return -ENODATA;
+
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ ret = -ENODATA;
+ } else {
+ rdma_copy_addr(dev_addr, dst->dev, n->ha);
+ }
+
+ neigh_release(n);
+
+ return ret;
+}
+
+static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
+{
+ struct rtable *rt;
+ struct rt6_info *rt6;
+
+ if (family == AF_INET) {
+ rt = container_of(dst, struct rtable, dst);
+ return rt->rt_uses_gateway;
+ }
+
+ rt6 = container_of(dst, struct rt6_info, dst);
+ return rt6->rt6i_flags & RTF_GATEWAY;
+}
+
+static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+ const struct sockaddr *dst_in, u32 seq)
+{
+ const struct sockaddr_in *dst_in4 =
+ (const struct sockaddr_in *)dst_in;
+ const struct sockaddr_in6 *dst_in6 =
+ (const struct sockaddr_in6 *)dst_in;
+ const void *daddr = (dst_in->sa_family == AF_INET) ?
+ (const void *)&dst_in4->sin_addr.s_addr :
+ (const void *)&dst_in6->sin6_addr;
+ sa_family_t family = dst_in->sa_family;
+
+ /* Gateway + ARPHRD_INFINIBAND -> IB router */
+ if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
+ return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
+ else
+ return dst_fetch_ha(dst, dev_addr, daddr);
+}
+
+static int addr4_resolve(struct sockaddr_in *src_in,
+ const struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr,
+ struct rtable **prt)
+{
+ __be32 src_ip = src_in->sin_addr.s_addr;
+ __be32 dst_ip = dst_in->sin_addr.s_addr;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ int ret;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst_ip;
+ fl4.saddr = src_ip;
+ fl4.flowi4_oif = addr->bound_dev_if;
+ rt = ip_route_output_key(addr->net, &fl4);
+ ret = PTR_ERR_OR_ZERO(rt);
+ if (ret)
+ return ret;
+
+ src_in->sin_family = AF_INET;
+ src_in->sin_addr.s_addr = fl4.saddr;
+
+ /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
+ * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
+ * type accordingly.
+ */
+ if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
+ addr->network = RDMA_NETWORK_IPV4;
+
+ addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
+
+ *prt = rt;
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ const struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr,
+ struct dst_entry **pdst)
+{
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ struct rt6_info *rt;
+
+ memset(&fl6, 0, sizeof fl6);
+ fl6.daddr = dst_in->sin6_addr;
+ fl6.saddr = src_in->sin6_addr;
+ fl6.flowi6_oif = addr->bound_dev_if;
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ rt = (struct rt6_info *)dst;
+ if (ipv6_addr_any(&src_in->sin6_addr)) {
+ src_in->sin6_family = AF_INET6;
+ src_in->sin6_addr = fl6.saddr;
+ }
+
+ /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
+ * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
+ * type accordingly.
+ */
+ if (rt->rt6i_flags & RTF_GATEWAY &&
+ ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
+ addr->network = RDMA_NETWORK_IPV6;
+
+ addr->hoplimit = ip6_dst_hoplimit(dst);
+
+ *pdst = dst;
+ return 0;
+}
+#else
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ const struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr,
+ struct dst_entry **pdst)
+{
+ return -EADDRNOTAVAIL;
+}
+#endif
+
+static int addr_resolve_neigh(const struct dst_entry *dst,
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr,
+ u32 seq)
+{
+ if (dst->dev->flags & IFF_LOOPBACK) {
+ int ret;
+
+ ret = rdma_translate_ip(dst_in, addr);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr,
+ MAX_ADDR_LEN);
+
+ return ret;
+ }
+
+ /* If the device doesn't do ARP internally */
+ if (!(dst->dev->flags & IFF_NOARP))
+ return fetch_ha(dst, addr, dst_in, seq);
+
+ rdma_copy_addr(addr, dst->dev, NULL);
+
+ return 0;
+}
+
+static int addr_resolve(struct sockaddr *src_in,
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr,
+ bool resolve_neigh,
+ u32 seq)
+{
+ struct net_device *ndev;
+ struct dst_entry *dst;
+ int ret;
+
+ if (!addr->net) {
+ pr_warn_ratelimited("%s: missing namespace\n", __func__);
+ return -EINVAL;
+ }
+
+ if (src_in->sa_family == AF_INET) {
+ struct rtable *rt = NULL;
+ const struct sockaddr_in *dst_in4 =
+ (const struct sockaddr_in *)dst_in;
+
+ ret = addr4_resolve((struct sockaddr_in *)src_in,
+ dst_in4, addr, &rt);
+ if (ret)
+ return ret;
+
+ if (resolve_neigh)
+ ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
+
+ if (addr->bound_dev_if) {
+ ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
+ } else {
+ ndev = rt->dst.dev;
+ dev_hold(ndev);
+ }
+
+ ip_rt_put(rt);
+ } else {
+ const struct sockaddr_in6 *dst_in6 =
+ (const struct sockaddr_in6 *)dst_in;
+
+ ret = addr6_resolve((struct sockaddr_in6 *)src_in,
+ dst_in6, addr,
+ &dst);
+ if (ret)
+ return ret;
+
+ if (resolve_neigh)
+ ret = addr_resolve_neigh(dst, dst_in, addr, seq);
+
+ if (addr->bound_dev_if) {
+ ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
+ } else {
+ ndev = dst->dev;
+ dev_hold(ndev);
+ }
+
+ dst_release(dst);
+ }
+
+ if (ndev) {
+ if (ndev->flags & IFF_LOOPBACK)
+ ret = rdma_translate_ip(dst_in, addr);
+ else
+ addr->bound_dev_if = ndev->ifindex;
+ dev_put(ndev);
+ }
+
+ return ret;
+}
+
+static void process_one_req(struct work_struct *_work)
+{
+ struct addr_req *req;
+ struct sockaddr *src_in, *dst_in;
+
+ req = container_of(_work, struct addr_req, work.work);
+
+ if (req->status == -ENODATA) {
+ src_in = (struct sockaddr *)&req->src_addr;
+ dst_in = (struct sockaddr *)&req->dst_addr;
+ req->status = addr_resolve(src_in, dst_in, req->addr,
+ true, req->seq);
+ if (req->status && time_after_eq(jiffies, req->timeout)) {
+ req->status = -ETIMEDOUT;
+ } else if (req->status == -ENODATA) {
+ /* requeue the work for retrying again */
+ spin_lock_bh(&lock);
+ if (!list_empty(&req->list))
+ set_timeout(req, req->timeout);
+ spin_unlock_bh(&lock);
+ return;
+ }
+ }
+
+ req->callback(req->status, (struct sockaddr *)&req->src_addr,
+ req->addr, req->context);
+ req->callback = NULL;
+
+ spin_lock_bh(&lock);
+ /*
+ * Although the work will normally have been canceled by the workqueue,
+ * it can still be requeued as long as it is on the req_list.
+ */
+ cancel_delayed_work(&req->work);
+ if (!list_empty(&req->list)) {
+ list_del_init(&req->list);
+ kfree(req);
+ }
+ spin_unlock_bh(&lock);
+}
+
+int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context)
+{
+ struct sockaddr *src_in, *dst_in;
+ struct addr_req *req;
+ int ret = 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
+ req->addr = addr;
+ req->callback = callback;
+ req->context = context;
+ INIT_DELAYED_WORK(&req->work, process_one_req);
+ req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
+
+ req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
+ switch (req->status) {
+ case 0:
+ req->timeout = jiffies;
+ queue_req(req);
+ break;
+ case -ENODATA:
+ req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+ queue_req(req);
+ break;
+ default:
+ ret = req->status;
+ goto err;
+ }
+ return ret;
+err:
+ kfree(req);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr)
+{
+ struct sockaddr_storage ssrc_addr = {};
+ struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family)
+ return -EINVAL;
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ return addr_resolve(src_in, dst_addr, addr, false, 0);
+}
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+ struct addr_req *req, *temp_req;
+ struct addr_req *found = NULL;
+
+ spin_lock_bh(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->addr == addr) {
+ /*
+ * Removing from the list means we take ownership of
+ * the req
+ */
+ list_del_init(&req->list);
+ found = req;
+ break;
+ }
+ }
+ spin_unlock_bh(&lock);
+
+ if (!found)
+ return;
+
+ /*
+ * sync canceling the work after removing it from the req_list
+ * guarentees no work is running and none will be started.
+ */
+ cancel_delayed_work_sync(&found->work);
+
+ if (found->callback)
+ found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr,
+ found->addr, found->context);
+
+ kfree(found);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+struct resolve_cb_context {
+ struct completion comp;
+ int status;
+};
+
+static void resolve_cb(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context)
+{
+ ((struct resolve_cb_context *)context)->status = status;
+ complete(&((struct resolve_cb_context *)context)->comp);
+}
+
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *dmac, const struct net_device *ndev,
+ int *hoplimit)
+{
+ struct rdma_dev_addr dev_addr;
+ struct resolve_cb_context ctx;
+ union {
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+ int ret;
+
+ rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid);
+ rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid);
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+ dev_addr.bound_dev_if = ndev->ifindex;
+ dev_addr.net = &init_net;
+
+ init_completion(&ctx.comp);
+ ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr,
+ (struct sockaddr *)&dgid_addr, &dev_addr, 1000,
+ resolve_cb, &ctx);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&ctx.comp);
+
+ ret = ctx.status;
+ if (ret)
+ return ret;
+
+ memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
+ *hoplimit = dev_addr.hoplimit;
+ return 0;
+}
+
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+ void *ctx)
+{
+ struct addr_req *req;
+
+ if (event == NETEVENT_NEIGH_UPDATE) {
+ struct neighbour *neigh = ctx;
+
+ if (neigh->nud_state & NUD_VALID) {
+ spin_lock_bh(&lock);
+ list_for_each_entry(req, &req_list, list)
+ set_timeout(req, jiffies);
+ spin_unlock_bh(&lock);
+ }
+ }
+ return 0;
+}
+
+static struct notifier_block nb = {
+ .notifier_call = netevent_callback
+};
+
+int addr_init(void)
+{
+ addr_wq = alloc_ordered_workqueue("ib_addr", 0);
+ if (!addr_wq)
+ return -ENOMEM;
+
+ register_netevent_notifier(&nb);
+
+ return 0;
+}
+
+void addr_cleanup(void)
+{
+ unregister_netevent_notifier(&nb);
+ destroy_workqueue(addr_wq);
+ WARN_ON(!list_empty(&req_list));
+}
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
new file mode 100644
index 000000000..324ef85a1
--- /dev/null
+++ b/drivers/infiniband/core/agent.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_port_private {
+ struct list_head port_list;
+ struct ib_mad_agent *agent[2];
+};
+
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
+static struct ib_agent_port_private *
+__ib_get_agent_port(const struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+
+ list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+ if (entry->agent[1]->device == device &&
+ entry->agent[1]->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+static struct ib_agent_port_private *
+ib_get_agent_port(const struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ entry = __ib_get_agent_port(device, port_num);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ return entry;
+}
+
+void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa)
+{
+ struct ib_agent_port_private *port_priv;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_ah *ah;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ if (rdma_cap_ib_switch(device))
+ port_priv = ib_get_agent_port(device, 0);
+ else
+ port_priv = ib_get_agent_port(device, port_num);
+
+ if (!port_priv) {
+ dev_err(&device->dev, "Unable to find port agent\n");
+ return;
+ }
+
+ agent = port_priv->agent[qpn];
+ ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+ if (IS_ERR(ah)) {
+ dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
+ PTR_ERR(ah));
+ return;
+ }
+
+ if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION)
+ resp_mad_len = IB_MGMT_MAD_SIZE;
+
+ send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+ IB_MGMT_MAD_HDR,
+ resp_mad_len - IB_MGMT_MAD_HDR,
+ GFP_KERNEL,
+ mad_hdr->base_version);
+ if (IS_ERR(send_buf)) {
+ dev_err(&device->dev, "ib_create_send_mad error\n");
+ goto err1;
+ }
+
+ memcpy(send_buf->mad, mad_hdr, resp_mad_len);
+ send_buf->ah = ah;
+
+ if (rdma_cap_ib_switch(device)) {
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_send_wr->send_wr.port_num = port_num;
+ }
+
+ if (ib_post_send_mad(send_buf, NULL)) {
+ dev_err(&device->dev, "ib_post_send_mad error\n");
+ goto err2;
+ }
+ return;
+err2:
+ ib_free_send_mad(send_buf);
+err1:
+ rdma_destroy_ah(ah);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ rdma_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+ int ret;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ if (rdma_cap_ib_smi(device, port_num)) {
+ /* Obtain send only MAD agent for SMI QP */
+ port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+ IB_QPT_SMI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[0])) {
+ ret = PTR_ERR(port_priv->agent[0]);
+ goto error2;
+ }
+ }
+
+ /* Obtain send only MAD agent for GSI QP */
+ port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[1])) {
+ ret = PTR_ERR(port_priv->agent[1]);
+ goto error3;
+ }
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ return 0;
+
+error3:
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+ kfree(port_priv);
+error1:
+ return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ port_priv = __ib_get_agent_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ ib_unregister_mad_agent(port_priv->agent[1]);
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+
+ kfree(port_priv);
+ return 0;
+}
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
new file mode 100644
index 000000000..65f92beda
--- /dev/null
+++ b/drivers/infiniband/core/agent.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+extern int ib_agent_port_open(struct ib_device *device, int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa);
+
+#endif /* __AGENT_H_ */
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
new file mode 100644
index 000000000..3208ad6ad
--- /dev/null
+++ b/drivers/infiniband/core/cache.c
@@ -0,0 +1,1463 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[0];
+};
+
+struct ib_update_work {
+ struct work_struct work;
+ struct ib_device *device;
+ u8 port_num;
+ bool enforce_security;
+};
+
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+enum gid_attr_find_mask {
+ GID_ATTR_FIND_MASK_GID = 1UL << 0,
+ GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
+ GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
+ GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3,
+};
+
+enum gid_table_entry_state {
+ GID_TABLE_ENTRY_INVALID = 1,
+ GID_TABLE_ENTRY_VALID = 2,
+ /*
+ * Indicates that entry is pending to be removed, there may
+ * be active users of this GID entry.
+ * When last user of the GID entry releases reference to it,
+ * GID entry is detached from the table.
+ */
+ GID_TABLE_ENTRY_PENDING_DEL = 3,
+};
+
+struct ib_gid_table_entry {
+ struct kref kref;
+ struct work_struct del_work;
+ struct ib_gid_attr attr;
+ void *context;
+ enum gid_table_entry_state state;
+};
+
+struct ib_gid_table {
+ int sz;
+ /* In RoCE, adding a GID to the table requires:
+ * (a) Find if this GID is already exists.
+ * (b) Find a free space.
+ * (c) Write the new GID
+ *
+ * Delete requires different set of operations:
+ * (a) Find the GID
+ * (b) Delete it.
+ *
+ **/
+ /* Any writer to data_vec must hold this lock and the write side of
+ * rwlock. Readers must hold only rwlock. All writers must be in a
+ * sleepable context.
+ */
+ struct mutex lock;
+ /* rwlock protects data_vec[ix]->state and entry pointer.
+ */
+ rwlock_t rwlock;
+ struct ib_gid_table_entry **data_vec;
+ /* bit field, each bit indicates the index of default GID */
+ u32 default_gid_indices;
+};
+
+static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
+{
+ struct ib_event event;
+
+ event.device = ib_dev;
+ event.element.port_num = port;
+ event.event = IB_EVENT_GID_CHANGE;
+
+ ib_dispatch_event(&event);
+}
+
+static const char * const gid_type_str[] = {
+ [IB_GID_TYPE_IB] = "IB/RoCE v1",
+ [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2",
+};
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type)
+{
+ if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type])
+ return gid_type_str[gid_type];
+
+ return "Invalid GID type";
+}
+EXPORT_SYMBOL(ib_cache_gid_type_str);
+
+/** rdma_is_zero_gid - Check if given GID is zero or not.
+ * @gid: GID to check
+ * Returns true if given GID is zero, returns false otherwise.
+ */
+bool rdma_is_zero_gid(const union ib_gid *gid)
+{
+ return !memcmp(gid, &zgid, sizeof(*gid));
+}
+EXPORT_SYMBOL(rdma_is_zero_gid);
+
+/** is_gid_index_default - Check if a given index belongs to
+ * reserved default GIDs or not.
+ * @table: GID table pointer
+ * @index: Index to check in GID table
+ * Returns true if index is one of the reserved default GID index otherwise
+ * returns false.
+ */
+static bool is_gid_index_default(const struct ib_gid_table *table,
+ unsigned int index)
+{
+ return index < 32 && (BIT(index) & table->default_gid_indices);
+}
+
+int ib_cache_gid_parse_type_str(const char *buf)
+{
+ unsigned int i;
+ size_t len;
+ int err = -EINVAL;
+
+ len = strlen(buf);
+ if (len == 0)
+ return -EINVAL;
+
+ if (buf[len - 1] == '\n')
+ len--;
+
+ for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i)
+ if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) &&
+ len == strlen(gid_type_str[i])) {
+ err = i;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
+
+static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
+{
+ return device->cache.ports[port - rdma_start_port(device)].gid;
+}
+
+static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
+{
+ return !entry;
+}
+
+static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry)
+{
+ return entry && entry->state == GID_TABLE_ENTRY_VALID;
+}
+
+static void schedule_free_gid(struct kref *kref)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(kref, struct ib_gid_table_entry, kref);
+
+ queue_work(ib_wq, &entry->del_work);
+}
+
+static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
+{
+ struct ib_device *device = entry->attr.device;
+ u8 port_num = entry->attr.port_num;
+ struct ib_gid_table *table = rdma_gid_table(device, port_num);
+
+ pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+ device->name, port_num, entry->attr.index,
+ entry->attr.gid.raw);
+
+ if (rdma_cap_roce_gid_table(device, port_num) &&
+ entry->state != GID_TABLE_ENTRY_INVALID)
+ device->del_gid(&entry->attr, &entry->context);
+
+ write_lock_irq(&table->rwlock);
+
+ /*
+ * The only way to avoid overwriting NULL in table is
+ * by comparing if it is same entry in table or not!
+ * If new entry in table is added by the time we free here,
+ * don't overwrite the table entry.
+ */
+ if (entry == table->data_vec[entry->attr.index])
+ table->data_vec[entry->attr.index] = NULL;
+ /* Now this index is ready to be allocated */
+ write_unlock_irq(&table->rwlock);
+
+ if (entry->attr.ndev)
+ dev_put(entry->attr.ndev);
+ kfree(entry);
+}
+
+static void free_gid_entry(struct kref *kref)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(kref, struct ib_gid_table_entry, kref);
+
+ free_gid_entry_locked(entry);
+}
+
+/**
+ * free_gid_work - Release reference to the GID entry
+ * @work: Work structure to refer to GID entry which needs to be
+ * deleted.
+ *
+ * free_gid_work() frees the entry from the HCA's hardware table
+ * if provider supports it. It releases reference to netdevice.
+ */
+static void free_gid_work(struct work_struct *work)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(work, struct ib_gid_table_entry, del_work);
+ struct ib_device *device = entry->attr.device;
+ u8 port_num = entry->attr.port_num;
+ struct ib_gid_table *table = rdma_gid_table(device, port_num);
+
+ mutex_lock(&table->lock);
+ free_gid_entry_locked(entry);
+ mutex_unlock(&table->lock);
+}
+
+static struct ib_gid_table_entry *
+alloc_gid_entry(const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return NULL;
+ kref_init(&entry->kref);
+ memcpy(&entry->attr, attr, sizeof(*attr));
+ if (entry->attr.ndev)
+ dev_hold(entry->attr.ndev);
+ INIT_WORK(&entry->del_work, free_gid_work);
+ entry->state = GID_TABLE_ENTRY_INVALID;
+ return entry;
+}
+
+static void store_gid_entry(struct ib_gid_table *table,
+ struct ib_gid_table_entry *entry)
+{
+ entry->state = GID_TABLE_ENTRY_VALID;
+
+ pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+ entry->attr.device->name, entry->attr.port_num,
+ entry->attr.index, entry->attr.gid.raw);
+
+ lockdep_assert_held(&table->lock);
+ write_lock_irq(&table->rwlock);
+ table->data_vec[entry->attr.index] = entry;
+ write_unlock_irq(&table->rwlock);
+}
+
+static void get_gid_entry(struct ib_gid_table_entry *entry)
+{
+ kref_get(&entry->kref);
+}
+
+static void put_gid_entry(struct ib_gid_table_entry *entry)
+{
+ kref_put(&entry->kref, schedule_free_gid);
+}
+
+static void put_gid_entry_locked(struct ib_gid_table_entry *entry)
+{
+ kref_put(&entry->kref, free_gid_entry);
+}
+
+static int add_roce_gid(struct ib_gid_table_entry *entry)
+{
+ const struct ib_gid_attr *attr = &entry->attr;
+ int ret;
+
+ if (!attr->ndev) {
+ pr_err("%s NULL netdev device=%s port=%d index=%d\n",
+ __func__, attr->device->name, attr->port_num,
+ attr->index);
+ return -EINVAL;
+ }
+ if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
+ ret = attr->device->add_gid(attr, &entry->context);
+ if (ret) {
+ pr_err("%s GID add failed device=%s port=%d index=%d\n",
+ __func__, attr->device->name, attr->port_num,
+ attr->index);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * del_gid - Delete GID table entry
+ *
+ * @ib_dev: IB device whose GID entry to be deleted
+ * @port: Port number of the IB device
+ * @table: GID table of the IB device for a port
+ * @ix: GID entry index to delete
+ *
+ */
+static void del_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix)
+{
+ struct ib_gid_table_entry *entry;
+
+ lockdep_assert_held(&table->lock);
+
+ pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+ ib_dev->name, port, ix,
+ table->data_vec[ix]->attr.gid.raw);
+
+ write_lock_irq(&table->rwlock);
+ entry = table->data_vec[ix];
+ entry->state = GID_TABLE_ENTRY_PENDING_DEL;
+ /*
+ * For non RoCE protocol, GID entry slot is ready to use.
+ */
+ if (!rdma_protocol_roce(ib_dev, port))
+ table->data_vec[ix] = NULL;
+ write_unlock_irq(&table->rwlock);
+
+ put_gid_entry_locked(entry);
+}
+
+/**
+ * add_modify_gid - Add or modify GID table entry
+ *
+ * @table: GID table in which GID to be added or modified
+ * @attr: Attributes of the GID
+ *
+ * Returns 0 on success or appropriate error code. It accepts zero
+ * GID addition for non RoCE ports for HCA's who report them as valid
+ * GID. However such zero GIDs are not added to the cache.
+ */
+static int add_modify_gid(struct ib_gid_table *table,
+ const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry;
+ int ret = 0;
+
+ /*
+ * Invalidate any old entry in the table to make it safe to write to
+ * this index.
+ */
+ if (is_gid_entry_valid(table->data_vec[attr->index]))
+ del_gid(attr->device, attr->port_num, table, attr->index);
+
+ /*
+ * Some HCA's report multiple GID entries with only one valid GID, and
+ * leave other unused entries as the zero GID. Convert zero GIDs to
+ * empty table entries instead of storing them.
+ */
+ if (rdma_is_zero_gid(&attr->gid))
+ return 0;
+
+ entry = alloc_gid_entry(attr);
+ if (!entry)
+ return -ENOMEM;
+
+ if (rdma_protocol_roce(attr->device, attr->port_num)) {
+ ret = add_roce_gid(entry);
+ if (ret)
+ goto done;
+ }
+
+ store_gid_entry(table, entry);
+ return 0;
+
+done:
+ put_gid_entry(entry);
+ return ret;
+}
+
+/* rwlock should be read locked, or lock should be held */
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+ const struct ib_gid_attr *val, bool default_gid,
+ unsigned long mask, int *pempty)
+{
+ int i = 0;
+ int found = -1;
+ int empty = pempty ? -1 : 0;
+
+ while (i < table->sz && (found < 0 || empty < 0)) {
+ struct ib_gid_table_entry *data = table->data_vec[i];
+ struct ib_gid_attr *attr;
+ int curr_index = i;
+
+ i++;
+
+ /* find_gid() is used during GID addition where it is expected
+ * to return a free entry slot which is not duplicate.
+ * Free entry slot is requested and returned if pempty is set,
+ * so lookup free slot only if requested.
+ */
+ if (pempty && empty < 0) {
+ if (is_gid_entry_free(data) &&
+ default_gid ==
+ is_gid_index_default(table, curr_index)) {
+ /*
+ * Found an invalid (free) entry; allocate it.
+ * If default GID is requested, then our
+ * found slot must be one of the DEFAULT
+ * reserved slots or we fail.
+ * This ensures that only DEFAULT reserved
+ * slots are used for default property GIDs.
+ */
+ empty = curr_index;
+ }
+ }
+
+ /*
+ * Additionally find_gid() is used to find valid entry during
+ * lookup operation; so ignore the entries which are marked as
+ * pending for removal and the entries which are marked as
+ * invalid.
+ */
+ if (!is_gid_entry_valid(data))
+ continue;
+
+ if (found >= 0)
+ continue;
+
+ attr = &data->attr;
+ if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
+ attr->gid_type != val->gid_type)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_GID &&
+ memcmp(gid, &data->attr.gid, sizeof(*gid)))
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+ attr->ndev != val->ndev)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+ is_gid_index_default(table, curr_index) != default_gid)
+ continue;
+
+ found = curr_index;
+ }
+
+ if (pempty)
+ *pempty = empty;
+
+ return found;
+}
+
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
+{
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr,
+ unsigned long mask, bool default_gid)
+{
+ struct ib_gid_table *table;
+ int ret = 0;
+ int empty;
+ int ix;
+
+ /* Do not allow adding zero GID in support of
+ * IB spec version 1.3 section 4.1.1 point (6) and
+ * section 12.7.10 and section 12.7.20
+ */
+ if (rdma_is_zero_gid(gid))
+ return -EINVAL;
+
+ table = rdma_gid_table(ib_dev, port);
+
+ mutex_lock(&table->lock);
+
+ ix = find_gid(table, gid, attr, default_gid, mask, &empty);
+ if (ix >= 0)
+ goto out_unlock;
+
+ if (empty < 0) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+ attr->device = ib_dev;
+ attr->index = empty;
+ attr->port_num = port;
+ attr->gid = *gid;
+ ret = add_modify_gid(table, attr);
+ if (!ret)
+ dispatch_gid_change_event(ib_dev, port);
+
+out_unlock:
+ mutex_unlock(&table->lock);
+ if (ret)
+ pr_warn("%s: unable to add gid %pI6 error=%d\n",
+ __func__, gid->raw, ret);
+ return ret;
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct net_device *idev;
+ unsigned long mask;
+ int ret;
+
+ if (ib_dev->get_netdev) {
+ idev = ib_dev->get_netdev(ib_dev, port);
+ if (idev && attr->ndev != idev) {
+ union ib_gid default_gid;
+
+ /* Adding default GIDs in not permitted */
+ make_default_gid(idev, &default_gid);
+ if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+ dev_put(idev);
+ return -EPERM;
+ }
+ }
+ if (idev)
+ dev_put(idev);
+ }
+
+ mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_NETDEV;
+
+ ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
+ return ret;
+}
+
+static int
+_ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr,
+ unsigned long mask, bool default_gid)
+{
+ struct ib_gid_table *table;
+ int ret = 0;
+ int ix;
+
+ table = rdma_gid_table(ib_dev, port);
+
+ mutex_lock(&table->lock);
+
+ ix = find_gid(table, gid, attr, default_gid, mask, NULL);
+ if (ix < 0) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ del_gid(ib_dev, port, table, ix);
+ dispatch_gid_change_event(ib_dev, port);
+
+out_unlock:
+ mutex_unlock(&table->lock);
+ if (ret)
+ pr_debug("%s: can't delete gid %pI6 error=%d\n",
+ __func__, gid->raw, ret);
+ return ret;
+}
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_DEFAULT |
+ GID_ATTR_FIND_MASK_NETDEV;
+
+ return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false);
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ struct ib_gid_table *table;
+ int ix;
+ bool deleted = false;
+
+ table = rdma_gid_table(ib_dev, port);
+
+ mutex_lock(&table->lock);
+
+ for (ix = 0; ix < table->sz; ix++) {
+ if (is_gid_entry_valid(table->data_vec[ix]) &&
+ table->data_vec[ix]->attr.ndev == ndev) {
+ del_gid(ib_dev, port, table, ix);
+ deleted = true;
+ }
+ }
+
+ mutex_unlock(&table->lock);
+
+ if (deleted)
+ dispatch_gid_change_event(ib_dev, port);
+
+ return 0;
+}
+
+/**
+ * rdma_find_gid_by_port - Returns the GID entry attributes when it finds
+ * a valid GID entry for given search parameters. It searches for the specified
+ * GID value in the local software cache.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @port_num: The port number of the device where the GID value should be
+ * searched.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
+ *
+ * Returns sgid attributes if the GID is found with valid reference or
+ * returns ERR_PTR for the error.
+ * The caller must invoke rdma_put_gid_attr() to release the reference.
+ */
+const struct ib_gid_attr *
+rdma_find_gid_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ u8 port, struct net_device *ndev)
+{
+ int local_index;
+ struct ib_gid_table *table;
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type};
+ const struct ib_gid_attr *attr;
+ unsigned long flags;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return ERR_PTR(-ENOENT);
+
+ table = rdma_gid_table(ib_dev, port);
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ read_lock_irqsave(&table->rwlock, flags);
+ local_index = find_gid(table, gid, &val, false, mask, NULL);
+ if (local_index >= 0) {
+ get_gid_entry(table->data_vec[local_index]);
+ attr = &table->data_vec[local_index]->attr;
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return attr;
+ }
+
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(rdma_find_gid_by_port);
+
+/**
+ * rdma_find_gid_by_filter - Returns the GID table attribute where a
+ * specified GID value occurs
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port: The port number of the device where the GID value could be
+ * searched.
+ * @filter: The filter function is executed on any matching GID in the table.
+ * If the filter function returns true, the corresponding index is returned,
+ * otherwise, we continue searching the GID table. It's guaranteed that
+ * while filter is executed, ndev field is valid and the structure won't
+ * change. filter is executed in an atomic context. filter must not be NULL.
+ *
+ * rdma_find_gid_by_filter() searches for the specified GID value
+ * of which the filter function returns true in the port's GID table.
+ *
+ */
+const struct ib_gid_attr *rdma_find_gid_by_filter(
+ struct ib_device *ib_dev, const union ib_gid *gid, u8 port,
+ bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *,
+ void *),
+ void *context)
+{
+ const struct ib_gid_attr *res = ERR_PTR(-ENOENT);
+ struct ib_gid_table *table;
+ unsigned long flags;
+ unsigned int i;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return ERR_PTR(-EINVAL);
+
+ table = rdma_gid_table(ib_dev, port);
+
+ read_lock_irqsave(&table->rwlock, flags);
+ for (i = 0; i < table->sz; i++) {
+ struct ib_gid_table_entry *entry = table->data_vec[i];
+
+ if (!is_gid_entry_valid(entry))
+ continue;
+
+ if (memcmp(gid, &entry->attr.gid, sizeof(*gid)))
+ continue;
+
+ if (filter(gid, &entry->attr, context)) {
+ get_gid_entry(entry);
+ res = &entry->attr;
+ break;
+ }
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return res;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+ struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL);
+
+ if (!table)
+ return NULL;
+
+ table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+ if (!table->data_vec)
+ goto err_free_table;
+
+ mutex_init(&table->lock);
+
+ table->sz = sz;
+ rwlock_init(&table->rwlock);
+ return table;
+
+err_free_table:
+ kfree(table);
+ return NULL;
+}
+
+static void release_gid_table(struct ib_device *device, u8 port,
+ struct ib_gid_table *table)
+{
+ bool leak = false;
+ int i;
+
+ if (!table)
+ return;
+
+ for (i = 0; i < table->sz; i++) {
+ if (is_gid_entry_free(table->data_vec[i]))
+ continue;
+ if (kref_read(&table->data_vec[i]->kref) > 1) {
+ pr_err("GID entry ref leak for %s (index %d) ref=%d\n",
+ device->name, i,
+ kref_read(&table->data_vec[i]->kref));
+ leak = true;
+ }
+ }
+ if (leak)
+ return;
+
+ kfree(table->data_vec);
+ kfree(table);
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ int i;
+ bool deleted = false;
+
+ if (!table)
+ return;
+
+ mutex_lock(&table->lock);
+ for (i = 0; i < table->sz; ++i) {
+ if (is_gid_entry_valid(table->data_vec[i])) {
+ del_gid(ib_dev, port, table, i);
+ deleted = true;
+ }
+ }
+ mutex_unlock(&table->lock);
+
+ if (deleted)
+ dispatch_gid_change_event(ib_dev, port);
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ unsigned long gid_type_mask,
+ enum ib_cache_gid_default_mode mode)
+{
+ union ib_gid gid = { };
+ struct ib_gid_attr gid_attr;
+ unsigned int gid_type;
+ unsigned long mask;
+
+ mask = GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_DEFAULT |
+ GID_ATTR_FIND_MASK_NETDEV;
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
+ if (1UL << gid_type & ~gid_type_mask)
+ continue;
+
+ gid_attr.gid_type = gid_type;
+
+ if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
+ make_default_gid(ndev, &gid);
+ __ib_cache_gid_add(ib_dev, port, &gid,
+ &gid_attr, mask, true);
+ } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) {
+ _ib_cache_gid_del(ib_dev, port, &gid,
+ &gid_attr, mask, true);
+ }
+ }
+}
+
+static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ unsigned int i;
+ unsigned long roce_gid_type_mask;
+ unsigned int num_default_gids;
+
+ roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+ num_default_gids = hweight_long(roce_gid_type_mask);
+ /* Reserve starting indices for default GIDs */
+ for (i = 0; i < num_default_gids && i < table->sz; i++)
+ table->default_gid_indices |= BIT(i);
+}
+
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table *table;
+ u8 port;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ table = ib_dev->cache.ports[port].gid;
+ release_gid_table(ib_dev, port, table);
+ ib_dev->cache.ports[port].gid = NULL;
+ }
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+ u8 port;
+ struct ib_gid_table *table;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ u8 rdma_port = port + rdma_start_port(ib_dev);
+
+ table = alloc_gid_table(
+ ib_dev->port_immutable[rdma_port].gid_tbl_len);
+ if (!table)
+ goto rollback_table_setup;
+
+ gid_table_reserve_default(ib_dev, rdma_port, table);
+ ib_dev->cache.ports[port].gid = table;
+ }
+ return 0;
+
+rollback_table_setup:
+ gid_table_release_one(ib_dev);
+ return -ENOMEM;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table *table;
+ u8 port;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ table = ib_dev->cache.ports[port].gid;
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table);
+ }
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+ int err;
+
+ err = _gid_table_setup_one(ib_dev);
+
+ if (err)
+ return err;
+
+ rdma_roce_rescan_device(ib_dev);
+
+ return err;
+}
+
+/**
+ * rdma_query_gid - Read the GID content from the GID software cache
+ * @device: Device to query the GID
+ * @port_num: Port number of the device
+ * @index: Index of the GID table entry to read
+ * @gid: Pointer to GID where to store the entry's GID
+ *
+ * rdma_query_gid() only reads the GID entry content for requested device,
+ * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't
+ * hold any reference to the GID table entry in the HCA or software cache.
+ *
+ * Returns 0 on success or appropriate error code.
+ *
+ */
+int rdma_query_gid(struct ib_device *device, u8 port_num,
+ int index, union ib_gid *gid)
+{
+ struct ib_gid_table *table;
+ unsigned long flags;
+ int res = -EINVAL;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ table = rdma_gid_table(device, port_num);
+ read_lock_irqsave(&table->rwlock, flags);
+
+ if (index < 0 || index >= table->sz ||
+ !is_gid_entry_valid(table->data_vec[index]))
+ goto done;
+
+ memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid));
+ res = 0;
+
+done:
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return res;
+}
+EXPORT_SYMBOL(rdma_query_gid);
+
+/**
+ * rdma_find_gid - Returns SGID attributes if the matching GID is found.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
+ *
+ * rdma_find_gid() searches for the specified GID value in the software cache.
+ *
+ * Returns GID attributes if a valid GID is found or returns ERR_PTR for the
+ * error. The caller must invoke rdma_put_gid_attr() to release the reference.
+ *
+ */
+const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ struct net_device *ndev)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
+ u8 p;
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ for (p = 0; p < device->phys_port_cnt; p++) {
+ struct ib_gid_table *table;
+ unsigned long flags;
+ int index;
+
+ table = device->cache.ports[p].gid;
+ read_lock_irqsave(&table->rwlock, flags);
+ index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
+ if (index >= 0) {
+ const struct ib_gid_attr *attr;
+
+ get_gid_entry(table->data_vec[index]);
+ attr = &table->data_vec[index]->attr;
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return attr;
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(rdma_find_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ int index,
+ u16 *pkey)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *pkey = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_get_cached_subnet_prefix(struct ib_device *device,
+ u8 port_num,
+ u64 *sn_pfx)
+{
+ unsigned long flags;
+ int p;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ p = port_num - rdma_start_port(device);
+ read_lock_irqsave(&device->cache.lock, flags);
+ *sn_pfx = device->cache.ports[p].subnet_prefix;
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_get_cached_subnet_prefix);
+
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+ int partial_ix = -1;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ if (cache->table[i] & 0x8000) {
+ *index = i;
+ ret = 0;
+ break;
+ } else
+ partial_ix = i;
+ }
+
+ if (ret && partial_ix >= 0) {
+ *index = partial_ix;
+ ret = 0;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_find_exact_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if (cache->table[i] == pkey) {
+ *index = i;
+ ret = 0;
+ break;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+
+int ib_get_cached_lmc(struct ib_device *device,
+ u8 port_num,
+ u8 *lmc)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+ *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc;
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+int ib_get_cached_port_state(struct ib_device *device,
+ u8 port_num,
+ enum ib_port_state *port_state)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+ *port_state = device->cache.ports[port_num
+ - rdma_start_port(device)].port_state;
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_port_state);
+
+/**
+ * rdma_get_gid_attr - Returns GID attributes for a port of a device
+ * at a requested gid_index, if a valid GID entry exists.
+ * @device: The device to query.
+ * @port_num: The port number on the device where the GID value
+ * is to be queried.
+ * @index: Index of the GID table entry whose attributes are to
+ * be queried.
+ *
+ * rdma_get_gid_attr() acquires reference count of gid attributes from the
+ * cached GID table. Caller must invoke rdma_put_gid_attr() to release
+ * reference to gid attribute regardless of link layer.
+ *
+ * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error
+ * code.
+ */
+const struct ib_gid_attr *
+rdma_get_gid_attr(struct ib_device *device, u8 port_num, int index)
+{
+ const struct ib_gid_attr *attr = ERR_PTR(-EINVAL);
+ struct ib_gid_table *table;
+ unsigned long flags;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return ERR_PTR(-EINVAL);
+
+ table = rdma_gid_table(device, port_num);
+ if (index < 0 || index >= table->sz)
+ return ERR_PTR(-EINVAL);
+
+ read_lock_irqsave(&table->rwlock, flags);
+ if (!is_gid_entry_valid(table->data_vec[index]))
+ goto done;
+
+ get_gid_entry(table->data_vec[index]);
+ attr = &table->data_vec[index]->attr;
+done:
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return attr;
+}
+EXPORT_SYMBOL(rdma_get_gid_attr);
+
+/**
+ * rdma_put_gid_attr - Release reference to the GID attribute
+ * @attr: Pointer to the GID attribute whose reference
+ * needs to be released.
+ *
+ * rdma_put_gid_attr() must be used to release reference whose
+ * reference is acquired using rdma_get_gid_attr() or any APIs
+ * which returns a pointer to the ib_gid_attr regardless of link layer
+ * of IB or RoCE.
+ *
+ */
+void rdma_put_gid_attr(const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(attr, struct ib_gid_table_entry, attr);
+
+ put_gid_entry(entry);
+}
+EXPORT_SYMBOL(rdma_put_gid_attr);
+
+/**
+ * rdma_hold_gid_attr - Get reference to existing GID attribute
+ *
+ * @attr: Pointer to the GID attribute whose reference
+ * needs to be taken.
+ *
+ * Increase the reference count to a GID attribute to keep it from being
+ * freed. Callers are required to already be holding a reference to attribute.
+ *
+ */
+void rdma_hold_gid_attr(const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(attr, struct ib_gid_table_entry, attr);
+
+ get_gid_entry(entry);
+}
+EXPORT_SYMBOL(rdma_hold_gid_attr);
+
+static int config_non_roce_gid_cache(struct ib_device *device,
+ u8 port, int gid_tbl_len)
+{
+ struct ib_gid_attr gid_attr = {};
+ struct ib_gid_table *table;
+ int ret = 0;
+ int i;
+
+ gid_attr.device = device;
+ gid_attr.port_num = port;
+ table = rdma_gid_table(device, port);
+
+ mutex_lock(&table->lock);
+ for (i = 0; i < gid_tbl_len; ++i) {
+ if (!device->query_gid)
+ continue;
+ ret = device->query_gid(device, port, i, &gid_attr.gid);
+ if (ret) {
+ pr_warn("query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ gid_attr.index = i;
+ add_modify_gid(table, &gid_attr);
+ }
+err:
+ mutex_unlock(&table->lock);
+ return ret;
+}
+
+static void ib_cache_update(struct ib_device *device,
+ u8 port,
+ bool enforce_security)
+{
+ struct ib_port_attr *tprops = NULL;
+ struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
+ int i;
+ int ret;
+
+ if (!rdma_is_port_valid(device, port))
+ return;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return;
+
+ ret = ib_query_port(device, port, tprops);
+ if (ret) {
+ pr_warn("ib_query_port failed (%d) for %s\n",
+ ret, device->name);
+ goto err;
+ }
+
+ if (!rdma_protocol_roce(device, port)) {
+ ret = config_non_roce_gid_cache(device, port,
+ tprops->gid_tbl_len);
+ if (ret)
+ goto err;
+ }
+
+ pkey_cache = kmalloc(struct_size(pkey_cache, table,
+ tprops->pkey_tbl_len),
+ GFP_KERNEL);
+ if (!pkey_cache)
+ goto err;
+
+ pkey_cache->table_len = tprops->pkey_tbl_len;
+
+ for (i = 0; i < pkey_cache->table_len; ++i) {
+ ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+ if (ret) {
+ pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ write_lock_irq(&device->cache.lock);
+
+ old_pkey_cache = device->cache.ports[port -
+ rdma_start_port(device)].pkey;
+
+ device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache;
+ device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc;
+ device->cache.ports[port - rdma_start_port(device)].port_state =
+ tprops->state;
+
+ device->cache.ports[port - rdma_start_port(device)].subnet_prefix =
+ tprops->subnet_prefix;
+ write_unlock_irq(&device->cache.lock);
+
+ if (enforce_security)
+ ib_security_cache_change(device,
+ port,
+ tprops->subnet_prefix);
+
+ kfree(old_pkey_cache);
+ kfree(tprops);
+ return;
+
+err:
+ kfree(pkey_cache);
+ kfree(tprops);
+}
+
+static void ib_cache_task(struct work_struct *_work)
+{
+ struct ib_update_work *work =
+ container_of(_work, struct ib_update_work, work);
+
+ ib_cache_update(work->device,
+ work->port_num,
+ work->enforce_security);
+ kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_update_work *work;
+
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_GID_CHANGE) {
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(&work->work, ib_cache_task);
+ work->device = event->device;
+ work->port_num = event->element.port_num;
+ if (event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_GID_CHANGE)
+ work->enforce_security = true;
+ else
+ work->enforce_security = false;
+
+ queue_work(ib_wq, &work->work);
+ }
+ }
+}
+
+int ib_cache_setup_one(struct ib_device *device)
+{
+ int p;
+ int err;
+
+ rwlock_init(&device->cache.lock);
+
+ device->cache.ports =
+ kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1,
+ sizeof(*device->cache.ports),
+ GFP_KERNEL);
+ if (!device->cache.ports)
+ return -ENOMEM;
+
+ err = gid_table_setup_one(device);
+ if (err) {
+ kfree(device->cache.ports);
+ device->cache.ports = NULL;
+ return err;
+ }
+
+ for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ ib_cache_update(device, p + rdma_start_port(device), true);
+
+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+ device, ib_cache_event);
+ ib_register_event_handler(&device->cache.event_handler);
+ return 0;
+}
+
+void ib_cache_release_one(struct ib_device *device)
+{
+ int p;
+
+ /*
+ * The release function frees all the cache elements.
+ * This function should be called as part of freeing
+ * all the device's resources when the cache could no
+ * longer be accessed.
+ */
+ for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ kfree(device->cache.ports[p].pkey);
+
+ gid_table_release_one(device);
+ kfree(device->cache.ports);
+}
+
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+ /* The cleanup function unregisters the event handler,
+ * waits for all in-progress workqueue elements and cleans
+ * up the GID cache. This function should be called after
+ * the device was removed from the devices list and all
+ * clients were removed, so the cache exists but is
+ * non-functional and shouldn't be updated anymore.
+ */
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_workqueue(ib_wq);
+ gid_table_cleanup_one(device);
+
+ /*
+ * Flush the wq second time for any pending GID delete work.
+ */
+ flush_workqueue(ib_wq);
+}
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644
index 000000000..126ac5f99
--- /dev/null
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ * accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing rdma device to user space applications to avoid
+ * resource accounting leak.
+ * Returns 0 on success or otherwise failure code.
+ */
+int ib_device_register_rdmacg(struct ib_device *device)
+{
+ device->cg_device.name = device->name;
+ return rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all the resources are deallocated, and after a stage when any
+ * other resource allocation by user application cannot be done
+ * for this device to avoid any leak in accounting.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+ rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+ return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+ resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+ rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+ resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
new file mode 100644
index 000000000..9bdb3fd97
--- /dev/null
+++ b/drivers/infiniband/core/cm.c
@@ -0,0 +1,4577 @@
+/*
+ * Copyright (c) 2004-2007 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+#include <linux/etherdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include "cm_msgs.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static const char * const ibcm_rej_reason_strs[] = {
+ [IB_CM_REJ_NO_QP] = "no QP",
+ [IB_CM_REJ_NO_EEC] = "no EEC",
+ [IB_CM_REJ_NO_RESOURCES] = "no resources",
+ [IB_CM_REJ_TIMEOUT] = "timeout",
+ [IB_CM_REJ_UNSUPPORTED] = "unsupported",
+ [IB_CM_REJ_INVALID_COMM_ID] = "invalid comm ID",
+ [IB_CM_REJ_INVALID_COMM_INSTANCE] = "invalid comm instance",
+ [IB_CM_REJ_INVALID_SERVICE_ID] = "invalid service ID",
+ [IB_CM_REJ_INVALID_TRANSPORT_TYPE] = "invalid transport type",
+ [IB_CM_REJ_STALE_CONN] = "stale conn",
+ [IB_CM_REJ_RDC_NOT_EXIST] = "RDC not exist",
+ [IB_CM_REJ_INVALID_GID] = "invalid GID",
+ [IB_CM_REJ_INVALID_LID] = "invalid LID",
+ [IB_CM_REJ_INVALID_SL] = "invalid SL",
+ [IB_CM_REJ_INVALID_TRAFFIC_CLASS] = "invalid traffic class",
+ [IB_CM_REJ_INVALID_HOP_LIMIT] = "invalid hop limit",
+ [IB_CM_REJ_INVALID_PACKET_RATE] = "invalid packet rate",
+ [IB_CM_REJ_INVALID_ALT_GID] = "invalid alt GID",
+ [IB_CM_REJ_INVALID_ALT_LID] = "invalid alt LID",
+ [IB_CM_REJ_INVALID_ALT_SL] = "invalid alt SL",
+ [IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS] = "invalid alt traffic class",
+ [IB_CM_REJ_INVALID_ALT_HOP_LIMIT] = "invalid alt hop limit",
+ [IB_CM_REJ_INVALID_ALT_PACKET_RATE] = "invalid alt packet rate",
+ [IB_CM_REJ_PORT_CM_REDIRECT] = "port CM redirect",
+ [IB_CM_REJ_PORT_REDIRECT] = "port redirect",
+ [IB_CM_REJ_INVALID_MTU] = "invalid MTU",
+ [IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES] = "insufficient resp resources",
+ [IB_CM_REJ_CONSUMER_DEFINED] = "consumer defined",
+ [IB_CM_REJ_INVALID_RNR_RETRY] = "invalid RNR retry",
+ [IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID] = "duplicate local comm ID",
+ [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version",
+ [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label",
+ [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label",
+};
+
+const char *__attribute_const__ ibcm_reject_msg(int reason)
+{
+ size_t index = reason;
+
+ if (index < ARRAY_SIZE(ibcm_rej_reason_strs) &&
+ ibcm_rej_reason_strs[index])
+ return ibcm_rej_reason_strs[index];
+ else
+ return "unrecognized reason";
+}
+EXPORT_SYMBOL(ibcm_reject_msg);
+
+static void cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client cm_client = {
+ .name = "cm",
+ .add = cm_add_one,
+ .remove = cm_remove_one
+};
+
+static struct ib_cm {
+ spinlock_t lock;
+ struct list_head device_list;
+ rwlock_t device_lock;
+ struct rb_root listen_service_table;
+ u64 listen_service_id;
+ /* struct rb_root peer_service_table; todo: fix peer to peer */
+ struct rb_root remote_qp_table;
+ struct rb_root remote_id_table;
+ struct rb_root remote_sidr_table;
+ struct idr local_id_table;
+ __be32 random_id_operand;
+ struct list_head timewait_list;
+ struct workqueue_struct *wq;
+ /* Sync on cm change port state */
+ spinlock_t state_lock;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+ CM_REQ_COUNTER,
+ CM_MRA_COUNTER,
+ CM_REJ_COUNTER,
+ CM_REP_COUNTER,
+ CM_RTU_COUNTER,
+ CM_DREQ_COUNTER,
+ CM_DREP_COUNTER,
+ CM_SIDR_REQ_COUNTER,
+ CM_SIDR_REP_COUNTER,
+ CM_LAP_COUNTER,
+ CM_APR_COUNTER,
+ CM_ATTR_COUNT,
+ CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+ CM_XMIT,
+ CM_XMIT_RETRIES,
+ CM_RECV,
+ CM_RECV_DUPLICATES,
+ CM_COUNTER_GROUPS
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]
+ [sizeof("cm_rx_duplicates")] = {
+ "cm_tx_msgs", "cm_tx_retries",
+ "cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {
+ struct kobject obj;
+ atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+struct cm_counter_attribute {
+ struct attribute attr;
+ int index;
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .index = _index \
+}
+
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {
+ &cm_req_counter_attr.attr,
+ &cm_mra_counter_attr.attr,
+ &cm_rej_counter_attr.attr,
+ &cm_rep_counter_attr.attr,
+ &cm_rtu_counter_attr.attr,
+ &cm_dreq_counter_attr.attr,
+ &cm_drep_counter_attr.attr,
+ &cm_sidr_req_counter_attr.attr,
+ &cm_sidr_rep_counter_attr.attr,
+ &cm_lap_counter_attr.attr,
+ &cm_apr_counter_attr.attr,
+ NULL
+};
+
+struct cm_port {
+ struct cm_device *cm_dev;
+ struct ib_mad_agent *mad_agent;
+ struct kobject port_obj;
+ u8 port_num;
+ struct list_head cm_priv_prim_list;
+ struct list_head cm_priv_altr_list;
+ struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
+};
+
+struct cm_device {
+ struct list_head list;
+ struct ib_device *ib_device;
+ struct device *device;
+ u8 ack_delay;
+ int going_down;
+ struct cm_port *port[0];
+};
+
+struct cm_av {
+ struct cm_port *port;
+ union ib_gid dgid;
+ struct rdma_ah_attr ah_attr;
+ u16 pkey_index;
+ u8 timeout;
+};
+
+struct cm_work {
+ struct delayed_work work;
+ struct list_head list;
+ struct cm_port *port;
+ struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */
+ __be32 local_id; /* Established / timewait */
+ __be32 remote_id;
+ struct ib_cm_event cm_event;
+ struct sa_path_rec path[0];
+};
+
+struct cm_timewait_info {
+ struct cm_work work; /* Must be first. */
+ struct list_head list;
+ struct rb_node remote_qp_node;
+ struct rb_node remote_id_node;
+ __be64 remote_ca_guid;
+ __be32 remote_qpn;
+ u8 inserted_remote_qp;
+ u8 inserted_remote_id;
+};
+
+struct cm_id_private {
+ struct ib_cm_id id;
+
+ struct rb_node service_node;
+ struct rb_node sidr_id_node;
+ spinlock_t lock; /* Do not acquire inside cm.lock */
+ struct completion comp;
+ atomic_t refcount;
+ /* Number of clients sharing this ib_cm_id. Only valid for listeners.
+ * Protected by the cm.lock spinlock. */
+ int listen_sharecount;
+
+ struct ib_mad_send_buf *msg;
+ struct cm_timewait_info *timewait_info;
+ /* todo: use alternate port on send failure */
+ struct cm_av av;
+ struct cm_av alt_av;
+
+ void *private_data;
+ __be64 tid;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ enum ib_qp_type qp_type;
+ __be32 sq_psn;
+ __be32 rq_psn;
+ int timeout_ms;
+ enum ib_mtu path_mtu;
+ __be16 pkey;
+ u8 private_data_len;
+ u8 max_cm_retries;
+ u8 peer_to_peer;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 service_timeout;
+ u8 target_ack_delay;
+
+ struct list_head prim_list;
+ struct list_head altr_list;
+ /* Indicates that the send port mad is registered and av is set */
+ int prim_send_port_not_ready;
+ int altr_send_port_not_ready;
+
+ struct list_head work_list;
+ atomic_t work_count;
+};
+
+static void cm_work_handler(struct work_struct *work);
+
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+ if (atomic_dec_and_test(&cm_id_priv->refcount))
+ complete(&cm_id_priv->comp);
+}
+
+static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
+ struct ib_mad_send_buf **msg)
+{
+ struct ib_mad_agent *mad_agent;
+ struct ib_mad_send_buf *m;
+ struct ib_ah *ah;
+ struct cm_av *av;
+ unsigned long flags, flags2;
+ int ret = 0;
+
+ /* don't let the port to be released till the agent is down */
+ spin_lock_irqsave(&cm.state_lock, flags2);
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_id_priv->prim_send_port_not_ready)
+ av = &cm_id_priv->av;
+ else if (!cm_id_priv->altr_send_port_not_ready &&
+ (cm_id_priv->alt_av.port))
+ av = &cm_id_priv->alt_av;
+ else {
+ pr_info("%s: not valid CM id\n", __func__);
+ ret = -ENODEV;
+ spin_unlock_irqrestore(&cm.lock, flags);
+ goto out;
+ }
+ spin_unlock_irqrestore(&cm.lock, flags);
+ /* Make sure the port haven't released the mad yet */
+ mad_agent = cm_id_priv->av.port->mad_agent;
+ if (!mad_agent) {
+ pr_info("%s: not a valid MAD agent\n", __func__);
+ ret = -ENODEV;
+ goto out;
+ }
+ ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto out;
+ }
+
+ m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
+ av->pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
+ if (IS_ERR(m)) {
+ rdma_destroy_ah(ah);
+ ret = PTR_ERR(m);
+ goto out;
+ }
+
+ /* Timeout set by caller if response is expected. */
+ m->ah = ah;
+ m->retries = cm_id_priv->max_cm_retries;
+
+ atomic_inc(&cm_id_priv->refcount);
+ m->context[0] = cm_id_priv;
+ *msg = m;
+
+out:
+ spin_unlock_irqrestore(&cm.state_lock, flags2);
+ return ret;
+}
+
+static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
+}
+
+static int cm_create_response_msg_ah(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ struct ib_mad_send_buf *msg)
+{
+ struct ib_ah *ah;
+
+ ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh, port->port_num);
+ if (IS_ERR(ah))
+ return PTR_ERR(ah);
+
+ msg->ah = ah;
+ return 0;
+}
+
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+ if (msg->ah)
+ rdma_destroy_ah(msg->ah);
+ if (msg->context[0])
+ cm_deref_id(msg->context[0]);
+ ib_free_send_mad(msg);
+}
+
+static int cm_alloc_response_msg(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ struct ib_mad_send_buf **msg)
+{
+ struct ib_mad_send_buf *m;
+ int ret;
+
+ m = cm_alloc_response_msg_no_ah(port, mad_recv_wc);
+ if (IS_ERR(m))
+ return PTR_ERR(m);
+
+ ret = cm_create_response_msg_ah(port, mad_recv_wc, m);
+ if (ret) {
+ cm_free_msg(m);
+ return ret;
+ }
+
+ *msg = m;
+ return 0;
+}
+
+static void * cm_copy_private_data(const void *private_data,
+ u8 private_data_len)
+{
+ void *data;
+
+ if (!private_data || !private_data_len)
+ return NULL;
+
+ data = kmemdup(private_data, private_data_len, GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ return data;
+}
+
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+ void *private_data, u8 private_data_len)
+{
+ if (cm_id_priv->private_data && cm_id_priv->private_data_len)
+ kfree(cm_id_priv->private_data);
+
+ cm_id_priv->private_data = private_data;
+ cm_id_priv->private_data_len = private_data_len;
+}
+
+static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
+ struct ib_grh *grh, struct cm_av *av)
+{
+ struct rdma_ah_attr new_ah_attr;
+ int ret;
+
+ av->port = port;
+ av->pkey_index = wc->pkey_index;
+
+ /*
+ * av->ah_attr might be initialized based on past wc during incoming
+ * connect request or while sending out connect request. So initialize
+ * a new ah_attr on stack. If initialization fails, old ah_attr is
+ * used for sending any responses. If initialization is successful,
+ * than new ah_attr is used by overwriting old one.
+ */
+ ret = ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
+ port->port_num, wc,
+ grh, &new_ah_attr);
+ if (ret)
+ return ret;
+
+ rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
+ return 0;
+}
+
+static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+ struct ib_grh *grh, struct cm_av *av)
+{
+ av->port = port;
+ av->pkey_index = wc->pkey_index;
+ return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
+ port->port_num, wc,
+ grh, &av->ah_attr);
+}
+
+static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv,
+ struct cm_av *av,
+ struct cm_port *port)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&cm.lock, flags);
+
+ if (&cm_id_priv->av == av)
+ list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list);
+ else if (&cm_id_priv->alt_av == av)
+ list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list);
+ else
+ ret = -EINVAL;
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return ret;
+}
+
+static struct cm_port *
+get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port = NULL;
+ unsigned long flags;
+
+ if (attr) {
+ read_lock_irqsave(&cm.device_lock, flags);
+ list_for_each_entry(cm_dev, &cm.device_list, list) {
+ if (cm_dev->ib_device == attr->device) {
+ port = cm_dev->port[attr->port_num - 1];
+ break;
+ }
+ }
+ read_unlock_irqrestore(&cm.device_lock, flags);
+ } else {
+ /* SGID attribute can be NULL in following
+ * conditions.
+ * (a) Alternative path
+ * (b) IB link layer without GRH
+ * (c) LAP send messages
+ */
+ read_lock_irqsave(&cm.device_lock, flags);
+ list_for_each_entry(cm_dev, &cm.device_list, list) {
+ attr = rdma_find_gid(cm_dev->ib_device,
+ &path->sgid,
+ sa_conv_pathrec_to_gid_type(path),
+ NULL);
+ if (!IS_ERR(attr)) {
+ port = cm_dev->port[attr->port_num - 1];
+ break;
+ }
+ }
+ read_unlock_irqrestore(&cm.device_lock, flags);
+ if (port)
+ rdma_put_gid_attr(attr);
+ }
+ return port;
+}
+
+static int cm_init_av_by_path(struct sa_path_rec *path,
+ const struct ib_gid_attr *sgid_attr,
+ struct cm_av *av,
+ struct cm_id_private *cm_id_priv)
+{
+ struct rdma_ah_attr new_ah_attr;
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ int ret;
+
+ port = get_cm_port_from_path(path, sgid_attr);
+ if (!port)
+ return -EINVAL;
+ cm_dev = port->cm_dev;
+
+ ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
+ be16_to_cpu(path->pkey), &av->pkey_index);
+ if (ret)
+ return ret;
+
+ av->port = port;
+
+ /*
+ * av->ah_attr might be initialized based on wc or during
+ * request processing time which might have reference to sgid_attr.
+ * So initialize a new ah_attr on stack.
+ * If initialization fails, old ah_attr is used for sending any
+ * responses. If initialization is successful, than new ah_attr
+ * is used by overwriting the old one. So that right ah_attr
+ * can be used to return an error response.
+ */
+ ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path,
+ &new_ah_attr, sgid_attr);
+ if (ret)
+ return ret;
+
+ av->timeout = path->packet_life_time + 1;
+
+ ret = add_cm_id_to_port_list(cm_id_priv, av, port);
+ if (ret) {
+ rdma_destroy_ah_attr(&new_ah_attr);
+ return ret;
+ }
+ rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
+ return 0;
+}
+
+static int cm_alloc_id(struct cm_id_private *cm_id_priv)
+{
+ unsigned long flags;
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&cm.lock, flags);
+
+ id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+ idr_preload_end();
+
+ cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+ return id < 0 ? id : 0;
+}
+
+static void cm_free_id(__be32 local_id)
+{
+ spin_lock_irq(&cm.lock);
+ idr_remove(&cm.local_id_table,
+ (__force int) (local_id ^ cm.random_id_operand));
+ spin_unlock_irq(&cm.lock);
+}
+
+static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
+{
+ struct cm_id_private *cm_id_priv;
+
+ cm_id_priv = idr_find(&cm.local_id_table,
+ (__force int) (local_id ^ cm.random_id_operand));
+ if (cm_id_priv) {
+ if (cm_id_priv->id.remote_id == remote_id)
+ atomic_inc(&cm_id_priv->refcount);
+ else
+ cm_id_priv = NULL;
+ }
+
+ return cm_id_priv;
+}
+
+static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+ struct cm_id_private *cm_id_priv;
+
+ spin_lock_irq(&cm.lock);
+ cm_id_priv = cm_get_id(local_id, remote_id);
+ spin_unlock_irq(&cm.lock);
+
+ return cm_id_priv;
+}
+
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+ return (__force u32) a < (__force u32) b;
+}
+
+static int be32_gt(__be32 a, __be32 b)
+{
+ return (__force u32) a > (__force u32) b;
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+ return (__force u64) a < (__force u64) b;
+}
+
+static int be64_gt(__be64 a, __be64 b)
+{
+ return (__force u64) a > (__force u64) b;
+}
+
+static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+{
+ struct rb_node **link = &cm.listen_service_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ __be64 service_id = cm_id_priv->id.service_id;
+ __be64 service_mask = cm_id_priv->id.service_mask;
+
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ service_node);
+ if ((cur_cm_id_priv->id.service_mask & service_id) ==
+ (service_mask & cur_cm_id_priv->id.service_id) &&
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device))
+ return cur_cm_id_priv;
+
+ if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+ link = &(*link)->rb_left;
+ else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+ link = &(*link)->rb_right;
+ else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_left;
+ else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_right;
+ else
+ link = &(*link)->rb_right;
+ }
+ rb_link_node(&cm_id_priv->service_node, parent, link);
+ rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+ return NULL;
+}
+
+static struct cm_id_private * cm_find_listen(struct ib_device *device,
+ __be64 service_id)
+{
+ struct rb_node *node = cm.listen_service_table.rb_node;
+ struct cm_id_private *cm_id_priv;
+
+ while (node) {
+ cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+ if ((cm_id_priv->id.service_mask & service_id) ==
+ cm_id_priv->id.service_id &&
+ (cm_id_priv->id.device == device))
+ return cm_id_priv;
+
+ if (device < cm_id_priv->id.device)
+ node = node->rb_left;
+ else if (device > cm_id_priv->id.device)
+ node = node->rb_right;
+ else if (be64_lt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_left;
+ else if (be64_gt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_right;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
+ *timewait_info)
+{
+ struct rb_node **link = &cm.remote_id_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_id = timewait_info->work.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_id = 1;
+ rb_link_node(&timewait_info->remote_id_node, parent, link);
+ rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
+ __be32 remote_id)
+{
+ struct rb_node *node = cm.remote_id_table.rb_node;
+ struct cm_timewait_info *timewait_info;
+
+ while (node) {
+ timewait_info = rb_entry(node, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_left;
+ else if (be32_gt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_right;
+ else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_left;
+ else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_right;
+ else
+ return timewait_info;
+ }
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
+ *timewait_info)
+{
+ struct rb_node **link = &cm.remote_qp_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_qpn = timewait_info->remote_qpn;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_qp_node);
+ if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_qp = 1;
+ rb_link_node(&timewait_info->remote_qp_node, parent, link);
+ rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ return NULL;
+}
+
+static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
+ *cm_id_priv)
+{
+ struct rb_node **link = &cm.remote_sidr_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ union ib_gid *port_gid = &cm_id_priv->av.dgid;
+ __be32 remote_id = cm_id_priv->id.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ sidr_id_node);
+ if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_right;
+ else {
+ int cmp;
+ cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid,
+ sizeof *port_gid);
+ if (cmp < 0)
+ link = &(*link)->rb_left;
+ else if (cmp > 0)
+ link = &(*link)->rb_right;
+ else
+ return cur_cm_id_priv;
+ }
+ }
+ rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
+ rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ return NULL;
+}
+
+static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
+ enum ib_cm_sidr_status status)
+{
+ struct ib_cm_sidr_rep_param param;
+
+ memset(&param, 0, sizeof param);
+ param.status = status;
+ ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.remote_cm_qpn = 1;
+ ret = cm_alloc_id(cm_id_priv);
+ if (ret)
+ goto error;
+
+ spin_lock_init(&cm_id_priv->lock);
+ init_completion(&cm_id_priv->comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ INIT_LIST_HEAD(&cm_id_priv->prim_list);
+ INIT_LIST_HEAD(&cm_id_priv->altr_list);
+ atomic_set(&cm_id_priv->work_count, -1);
+ atomic_set(&cm_id_priv->refcount, 1);
+ return &cm_id_priv->id;
+
+error:
+ kfree(cm_id_priv);
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+ struct cm_work *work;
+
+ if (list_empty(&cm_id_priv->work_list))
+ return NULL;
+
+ work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
+ list_del(&work->list);
+ return work;
+}
+
+static void cm_free_work(struct cm_work *work)
+{
+ if (work->mad_recv_wc)
+ ib_free_recv_mad(work->mad_recv_wc);
+ kfree(work);
+}
+
+static inline int cm_convert_to_ms(int iba_time)
+{
+ /* approximate conversion to ms from 4.096us x 2^iba_time */
+ return 1 << max(iba_time - 8, 0);
+}
+
+/*
+ * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select the max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
+ */
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+ int ack_timeout = packet_life_time + 1;
+
+ if (ack_timeout >= ca_ack_delay)
+ ack_timeout += (ca_ack_delay >= (ack_timeout - 1));
+ else
+ ack_timeout = ca_ack_delay +
+ (ack_timeout >= (ca_ack_delay - 1));
+
+ return min(31, ack_timeout);
+}
+
+static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
+{
+ if (timewait_info->inserted_remote_id) {
+ rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table);
+ timewait_info->inserted_remote_id = 0;
+ }
+
+ if (timewait_info->inserted_remote_qp) {
+ rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ timewait_info->inserted_remote_qp = 0;
+ }
+}
+
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+{
+ struct cm_timewait_info *timewait_info;
+
+ timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
+ if (!timewait_info)
+ return ERR_PTR(-ENOMEM);
+
+ timewait_info->work.local_id = local_id;
+ INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler);
+ timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+ return timewait_info;
+}
+
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+ int wait_time;
+ unsigned long flags;
+ struct cm_device *cm_dev;
+
+ cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client);
+ if (!cm_dev)
+ return;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ /*
+ * The cm_id could be destroyed by the user before we exit timewait.
+ * To protect against this, we search for the cm_id after exiting
+ * timewait before notifying the user that we've exited timewait.
+ */
+ cm_id_priv->id.state = IB_CM_TIMEWAIT;
+ wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
+
+ /* Check if the device started its remove_one */
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_dev->going_down)
+ queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+ msecs_to_jiffies(wait_time));
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ cm_id_priv->timewait_info = NULL;
+}
+
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+ unsigned long flags;
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ if (cm_id_priv->timewait_info) {
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ kfree(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ }
+}
+
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+retest:
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id->state) {
+ case IB_CM_LISTEN:
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ spin_lock_irq(&cm.lock);
+ if (--cm_id_priv->listen_sharecount > 0) {
+ /* The id is still shared. */
+ cm_deref_id(cm_id_priv);
+ spin_unlock_irq(&cm.lock);
+ return;
+ }
+ rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+ spin_unlock_irq(&cm.lock);
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id->state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_SIDR_REQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+ spin_lock_irq(&cm.lock);
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
+ rb_erase(&cm_id_priv->sidr_id_node,
+ &cm.remote_sidr_table);
+ spin_unlock_irq(&cm.lock);
+ break;
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
+ &cm_id_priv->id.device->node_guid,
+ sizeof cm_id_priv->id.device->node_guid,
+ NULL, 0);
+ break;
+ case IB_CM_REQ_RCVD:
+ if (err == -ENOMEM) {
+ /* Do not reject to allow future retries. */
+ cm_reset_to_idle(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ } else {
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ }
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* Fall through */
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_ESTABLISHED:
+ spin_unlock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->qp_type == IB_QPT_XRC_TGT)
+ break;
+ ib_send_cm_dreq(cm_id, NULL, 0);
+ goto retest;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_DREQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ }
+
+ spin_lock_irq(&cm_id_priv->lock);
+ spin_lock(&cm.lock);
+ /* Required for cleanup paths related cm_req_handler() */
+ if (cm_id_priv->timewait_info) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ kfree(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ }
+ if (!list_empty(&cm_id_priv->altr_list) &&
+ (!cm_id_priv->altr_send_port_not_ready))
+ list_del(&cm_id_priv->altr_list);
+ if (!list_empty(&cm_id_priv->prim_list) &&
+ (!cm_id_priv->prim_send_port_not_ready))
+ list_del(&cm_id_priv->prim_list);
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ cm_free_id(cm_id->local_id);
+ cm_deref_id(cm_id_priv);
+ wait_for_completion(&cm_id_priv->comp);
+ while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+ cm_free_work(work);
+
+ rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr);
+ rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr);
+ kfree(cm_id_priv->private_data);
+ kfree(cm_id_priv);
+}
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+ cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+ __be64 service_mask)
+{
+ struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+ int ret = 0;
+
+ service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
+ service_id &= service_mask;
+ if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+ (service_id != IB_CM_ASSIGN_SERVICE_ID))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ if (cm_id->state != IB_CM_IDLE)
+ return -EINVAL;
+
+ cm_id->state = IB_CM_LISTEN;
+ ++cm_id_priv->listen_sharecount;
+
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+ cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
+ cm_id->service_mask = ~cpu_to_be64(0);
+ } else {
+ cm_id->service_id = service_id;
+ cm_id->service_mask = service_mask;
+ }
+ cur_cm_id_priv = cm_insert_listen(cm_id_priv);
+
+ if (cur_cm_id_priv) {
+ cm_id->state = IB_CM_IDLE;
+ --cm_id_priv->listen_sharecount;
+ ret = -EBUSY;
+ }
+ return ret;
+}
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ ret = __ib_cm_listen(cm_id, service_id, service_mask);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+/**
+ * Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_id *cm_id;
+ unsigned long flags;
+ int err = 0;
+
+ /* Create an ID in advance, since the creation may sleep */
+ cm_id = ib_create_cm_id(device, cm_handler, NULL);
+ if (IS_ERR(cm_id))
+ return cm_id;
+
+ spin_lock_irqsave(&cm.lock, flags);
+
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+ goto new_id;
+
+ /* Find an existing ID */
+ cm_id_priv = cm_find_listen(device, service_id);
+ if (cm_id_priv) {
+ if (cm_id->cm_handler != cm_handler || cm_id->context) {
+ /* Sharing an ib_cm_id with different handlers is not
+ * supported */
+ spin_unlock_irqrestore(&cm.lock, flags);
+ ib_destroy_cm_id(cm_id);
+ return ERR_PTR(-EINVAL);
+ }
+ atomic_inc(&cm_id_priv->refcount);
+ ++cm_id_priv->listen_sharecount;
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ ib_destroy_cm_id(cm_id);
+ cm_id = &cm_id_priv->id;
+ return cm_id;
+ }
+
+new_id:
+ /* Use newly created ID */
+ err = __ib_cm_listen(cm_id, service_id, 0);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (err) {
+ ib_destroy_cm_id(cm_id);
+ return ERR_PTR(err);
+ }
+ return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
+{
+ u64 hi_tid, low_tid;
+
+ hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+ low_tid = (u64)cm_id_priv->id.local_id;
+ return cpu_to_be64(hi_tid | low_tid);
+}
+
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+ __be16 attr_id, __be64 tid)
+{
+ hdr->base_version = IB_MGMT_BASE_VERSION;
+ hdr->mgmt_class = IB_MGMT_CLASS_CM;
+ hdr->class_version = IB_CM_CLASS_VERSION;
+ hdr->method = IB_MGMT_METHOD_SEND;
+ hdr->attr_id = attr_id;
+ hdr->tid = tid;
+}
+
+static void cm_format_req(struct cm_req_msg *req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_req_param *param)
+{
+ struct sa_path_rec *pri_path = param->primary_path;
+ struct sa_path_rec *alt_path = param->alternate_path;
+ bool pri_ext = false;
+
+ if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA)
+ pri_ext = opa_is_extended_lid(pri_path->opa.dlid,
+ pri_path->opa.slid);
+
+ cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv));
+
+ req_msg->local_comm_id = cm_id_priv->id.local_id;
+ req_msg->service_id = param->service_id;
+ req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+ cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
+ cm_req_set_init_depth(req_msg, param->initiator_depth);
+ cm_req_set_remote_resp_timeout(req_msg,
+ param->remote_cm_response_timeout);
+ cm_req_set_qp_type(req_msg, param->qp_type);
+ cm_req_set_flow_ctrl(req_msg, param->flow_control);
+ cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
+ cm_req_set_local_resp_timeout(req_msg,
+ param->local_cm_response_timeout);
+ req_msg->pkey = param->primary_path->pkey;
+ cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
+ cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+
+ if (param->qp_type != IB_QPT_XRC_INI) {
+ cm_req_set_resp_res(req_msg, param->responder_resources);
+ cm_req_set_retry_count(req_msg, param->retry_count);
+ cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
+ cm_req_set_srq(req_msg, param->srq);
+ }
+
+ req_msg->primary_local_gid = pri_path->sgid;
+ req_msg->primary_remote_gid = pri_path->dgid;
+ if (pri_ext) {
+ req_msg->primary_local_gid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid));
+ req_msg->primary_remote_gid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid));
+ }
+ if (pri_path->hop_limit <= 1) {
+ req_msg->primary_local_lid = pri_ext ? 0 :
+ htons(ntohl(sa_path_get_slid(pri_path)));
+ req_msg->primary_remote_lid = pri_ext ? 0 :
+ htons(ntohl(sa_path_get_dlid(pri_path)));
+ } else {
+ /* Work-around until there's a way to obtain remote LID info */
+ req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+ req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+ }
+ cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+ cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+ req_msg->primary_traffic_class = pri_path->traffic_class;
+ req_msg->primary_hop_limit = pri_path->hop_limit;
+ cm_req_set_primary_sl(req_msg, pri_path->sl);
+ cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
+ cm_req_set_primary_local_ack_timeout(req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ pri_path->packet_life_time));
+
+ if (alt_path) {
+ bool alt_ext = false;
+
+ if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA)
+ alt_ext = opa_is_extended_lid(alt_path->opa.dlid,
+ alt_path->opa.slid);
+
+ req_msg->alt_local_gid = alt_path->sgid;
+ req_msg->alt_remote_gid = alt_path->dgid;
+ if (alt_ext) {
+ req_msg->alt_local_gid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid));
+ req_msg->alt_remote_gid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid));
+ }
+ if (alt_path->hop_limit <= 1) {
+ req_msg->alt_local_lid = alt_ext ? 0 :
+ htons(ntohl(sa_path_get_slid(alt_path)));
+ req_msg->alt_remote_lid = alt_ext ? 0 :
+ htons(ntohl(sa_path_get_dlid(alt_path)));
+ } else {
+ req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+ req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+ }
+ cm_req_set_alt_flow_label(req_msg,
+ alt_path->flow_label);
+ cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+ req_msg->alt_traffic_class = alt_path->traffic_class;
+ req_msg->alt_hop_limit = alt_path->hop_limit;
+ cm_req_set_alt_sl(req_msg, alt_path->sl);
+ cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
+ cm_req_set_alt_local_ack_timeout(req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ alt_path->packet_life_time));
+ }
+
+ if (param->private_data && param->private_data_len)
+ memcpy(req_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+ /* peer-to-peer not supported */
+ if (param->peer_to_peer)
+ return -EINVAL;
+
+ if (!param->primary_path)
+ return -EINVAL;
+
+ if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC &&
+ param->qp_type != IB_QPT_XRC_INI)
+ return -EINVAL;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ if (param->alternate_path &&
+ (param->alternate_path->pkey != param->primary_path->pkey ||
+ param->alternate_path->mtu != param->primary_path->mtu))
+ return -EINVAL;
+
+ return 0;
+}
+
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+ struct ib_cm_req_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_req_msg *req_msg;
+ unsigned long flags;
+ int ret;
+
+ ret = cm_validate_req_param(param);
+ if (ret)
+ return ret;
+
+ /* Verify that we're not in timewait. */
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = -EINVAL;
+ goto out;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+ id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ goto out;
+ }
+
+ ret = cm_init_av_by_path(param->primary_path,
+ param->ppath_sgid_attr, &cm_id_priv->av,
+ cm_id_priv);
+ if (ret)
+ goto out;
+ if (param->alternate_path) {
+ ret = cm_init_av_by_path(param->alternate_path, NULL,
+ &cm_id_priv->alt_av, cm_id_priv);
+ if (ret)
+ goto out;
+ }
+ cm_id->service_id = param->service_id;
+ cm_id->service_mask = ~cpu_to_be64(0);
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ param->primary_path->packet_life_time) * 2 +
+ cm_convert_to_ms(
+ param->remote_cm_response_timeout);
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->retry_count = param->retry_count;
+ cm_id_priv->path_mtu = param->primary_path->mtu;
+ cm_id_priv->pkey = param->primary_path->pkey;
+ cm_id_priv->qp_type = param->qp_type;
+
+ ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
+ if (ret)
+ goto out;
+
+ req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
+ cm_format_req(req_msg, cm_id_priv, param);
+ cm_id_priv->tid = req_msg->hdr.tid;
+ cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms;
+ cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT;
+
+ cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg);
+ cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ ret = ib_post_send_mad(cm_id_priv->msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ goto error2;
+ }
+ BUG_ON(cm_id->state != IB_CM_IDLE);
+ cm_id->state = IB_CM_REQ_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error2: cm_free_msg(cm_id_priv->msg);
+out: return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+static int cm_issue_rej(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ enum ib_cm_rej_reason reason,
+ enum cm_msg_response msg_rejected,
+ void *ari, u8 ari_length)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_rej_msg *rej_msg, *rcv_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ /* We just need common CM header information. Cast to any message. */
+ rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+ rej_msg = (struct cm_rej_msg *) msg->mad;
+
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
+ rej_msg->remote_comm_id = rcv_msg->local_comm_id;
+ rej_msg->local_comm_id = rcv_msg->remote_comm_id;
+ cm_rej_set_msg_rejected(rej_msg, msg_rejected);
+ rej_msg->reason = cpu_to_be16(reason);
+
+ if (ari && ari_length) {
+ cm_rej_set_reject_info_len(rej_msg, ari_length);
+ memcpy(rej_msg->ari, ari, ari_length);
+ }
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+ return ret;
+}
+
+static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
+ __be32 local_qpn, __be32 remote_qpn)
+{
+ return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) ||
+ ((local_ca_guid == remote_ca_guid) &&
+ (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn))));
+}
+
+static bool cm_req_has_alt_path(struct cm_req_msg *req_msg)
+{
+ return ((req_msg->alt_local_lid) ||
+ (ib_is_opa_gid(&req_msg->alt_local_gid)));
+}
+
+static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num,
+ struct sa_path_rec *path, union ib_gid *gid)
+{
+ if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num))
+ path->rec_type = SA_PATH_REC_TYPE_OPA;
+ else
+ path->rec_type = SA_PATH_REC_TYPE_IB;
+}
+
+static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg,
+ struct sa_path_rec *primary_path,
+ struct sa_path_rec *alt_path)
+{
+ u32 lid;
+
+ if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) {
+ sa_path_set_dlid(primary_path,
+ ntohs(req_msg->primary_local_lid));
+ sa_path_set_slid(primary_path,
+ ntohs(req_msg->primary_remote_lid));
+ } else {
+ lid = opa_get_lid_from_gid(&req_msg->primary_local_gid);
+ sa_path_set_dlid(primary_path, lid);
+
+ lid = opa_get_lid_from_gid(&req_msg->primary_remote_gid);
+ sa_path_set_slid(primary_path, lid);
+ }
+
+ if (!cm_req_has_alt_path(req_msg))
+ return;
+
+ if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) {
+ sa_path_set_dlid(alt_path, ntohs(req_msg->alt_local_lid));
+ sa_path_set_slid(alt_path, ntohs(req_msg->alt_remote_lid));
+ } else {
+ lid = opa_get_lid_from_gid(&req_msg->alt_local_gid);
+ sa_path_set_dlid(alt_path, lid);
+
+ lid = opa_get_lid_from_gid(&req_msg->alt_remote_gid);
+ sa_path_set_slid(alt_path, lid);
+ }
+}
+
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+ struct sa_path_rec *primary_path,
+ struct sa_path_rec *alt_path)
+{
+ primary_path->dgid = req_msg->primary_local_gid;
+ primary_path->sgid = req_msg->primary_remote_gid;
+ primary_path->flow_label = cm_req_get_primary_flow_label(req_msg);
+ primary_path->hop_limit = req_msg->primary_hop_limit;
+ primary_path->traffic_class = req_msg->primary_traffic_class;
+ primary_path->reversible = 1;
+ primary_path->pkey = req_msg->pkey;
+ primary_path->sl = cm_req_get_primary_sl(req_msg);
+ primary_path->mtu_selector = IB_SA_EQ;
+ primary_path->mtu = cm_req_get_path_mtu(req_msg);
+ primary_path->rate_selector = IB_SA_EQ;
+ primary_path->rate = cm_req_get_primary_packet_rate(req_msg);
+ primary_path->packet_life_time_selector = IB_SA_EQ;
+ primary_path->packet_life_time =
+ cm_req_get_primary_local_ack_timeout(req_msg);
+ primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+ primary_path->service_id = req_msg->service_id;
+ if (sa_path_is_roce(primary_path))
+ primary_path->roce.route_resolved = false;
+
+ if (cm_req_has_alt_path(req_msg)) {
+ alt_path->dgid = req_msg->alt_local_gid;
+ alt_path->sgid = req_msg->alt_remote_gid;
+ alt_path->flow_label = cm_req_get_alt_flow_label(req_msg);
+ alt_path->hop_limit = req_msg->alt_hop_limit;
+ alt_path->traffic_class = req_msg->alt_traffic_class;
+ alt_path->reversible = 1;
+ alt_path->pkey = req_msg->pkey;
+ alt_path->sl = cm_req_get_alt_sl(req_msg);
+ alt_path->mtu_selector = IB_SA_EQ;
+ alt_path->mtu = cm_req_get_path_mtu(req_msg);
+ alt_path->rate_selector = IB_SA_EQ;
+ alt_path->rate = cm_req_get_alt_packet_rate(req_msg);
+ alt_path->packet_life_time_selector = IB_SA_EQ;
+ alt_path->packet_life_time =
+ cm_req_get_alt_local_ack_timeout(req_msg);
+ alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ alt_path->service_id = req_msg->service_id;
+
+ if (sa_path_is_roce(alt_path))
+ alt_path->roce.route_resolved = false;
+ }
+ cm_format_path_lid_from_req(req_msg, primary_path, alt_path);
+}
+
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+ struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+ u8 port_num = work->port->port_num;
+ u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+ u16 pkey;
+ int ret;
+
+ ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+ if (ret) {
+ dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+ port_num, pkey_index, ret);
+ return 0;
+ }
+
+ return pkey;
+}
+
+/**
+ * Convert OPA SGID to IB SGID
+ * ULPs (such as IPoIB) do not understand OPA GIDs and will
+ * reject them as the local_gid will not match the sgid. Therefore,
+ * change the pathrec's SGID to an IB SGID.
+ *
+ * @work: Work completion
+ * @path: Path record
+ */
+static void cm_opa_to_ib_sgid(struct cm_work *work,
+ struct sa_path_rec *path)
+{
+ struct ib_device *dev = work->port->cm_dev->ib_device;
+ u8 port_num = work->port->port_num;
+
+ if (rdma_cap_opa_ah(dev, port_num) &&
+ (ib_is_opa_gid(&path->sgid))) {
+ union ib_gid sgid;
+
+ if (rdma_query_gid(dev, port_num, 0, &sgid)) {
+ dev_warn(&dev->dev,
+ "Error updating sgid in CM request\n");
+ return;
+ }
+
+ path->sgid = sgid;
+ }
+}
+
+static void cm_format_req_event(struct cm_work *work,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_req_msg *req_msg;
+ struct ib_cm_req_event_param *param;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.req_rcvd;
+ param->listen_id = listen_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
+ param->port = cm_id_priv->av.port->port_num;
+ param->primary_path = &work->path[0];
+ cm_opa_to_ib_sgid(work, param->primary_path);
+ if (cm_req_has_alt_path(req_msg)) {
+ param->alternate_path = &work->path[1];
+ cm_opa_to_ib_sgid(work, param->alternate_path);
+ } else {
+ param->alternate_path = NULL;
+ }
+ param->remote_ca_guid = req_msg->local_ca_guid;
+ param->remote_qkey = be32_to_cpu(req_msg->local_qkey);
+ param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg));
+ param->qp_type = cm_req_get_qp_type(req_msg);
+ param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg));
+ param->responder_resources = cm_req_get_init_depth(req_msg);
+ param->initiator_depth = cm_req_get_resp_res(req_msg);
+ param->local_cm_response_timeout =
+ cm_req_get_remote_resp_timeout(req_msg);
+ param->flow_control = cm_req_get_flow_ctrl(req_msg);
+ param->remote_cm_response_timeout =
+ cm_req_get_local_resp_timeout(req_msg);
+ param->retry_count = cm_req_get_retry_count(req_msg);
+ param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+ param->srq = cm_req_get_srq(req_msg);
+ param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr;
+ work->cm_event.private_data = &req_msg->private_data;
+}
+
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+ struct cm_work *work)
+{
+ int ret;
+
+ /* We will typically only have the current event to report. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+ cm_free_work(work);
+
+ while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+ spin_lock_irq(&cm_id_priv->lock);
+ work = cm_dequeue_work(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ if (!work)
+ return;
+
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+ &work->cm_event);
+ cm_free_work(work);
+ }
+ cm_deref_id(cm_id_priv);
+ if (ret)
+ cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+ struct cm_id_private *cm_id_priv,
+ enum cm_msg_response msg_mraed, u8 service_timeout,
+ const void *private_data, u8 private_data_len)
+{
+ cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+ cm_mra_set_msg_mraed(mra_msg, msg_mraed);
+ mra_msg->local_comm_id = cm_id_priv->id.local_id;
+ mra_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_mra_set_service_timeout(mra_msg, service_timeout);
+
+ if (private_data && private_data_len)
+ memcpy(mra_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+ struct cm_id_private *cm_id_priv,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+ rej_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ switch(cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ rej_msg->local_comm_id = 0;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_MRA_REQ_SENT:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP);
+ break;
+ default:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER);
+ break;
+ }
+
+ rej_msg->reason = cpu_to_be16(reason);
+ if (ari && ari_length) {
+ cm_rej_set_reject_info_len(rej_msg, ari_length);
+ memcpy(rej_msg->ari, ari, ari_length);
+ }
+
+ if (private_data && private_data_len)
+ memcpy(rej_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_dup_req_handler(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_REQ_COUNTER]);
+
+ /* Quick state check to discard duplicate REQs. */
+ if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+ return;
+
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ return;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_MRA_REQ_SENT:
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ break;
+ case IB_CM_TIMEWAIT:
+ cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv,
+ IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0);
+ break;
+ default:
+ goto unlock;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ return;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_msg(msg);
+}
+
+static struct cm_id_private * cm_match_req(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+ struct cm_timewait_info *timewait_info;
+ struct cm_req_msg *req_msg;
+ struct ib_cm_id *cm_id;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ /* Check for possible duplicate REQ. */
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ spin_unlock_irq(&cm.lock);
+ if (cur_cm_id_priv) {
+ cm_dup_req_handler(work, cur_cm_id_priv);
+ cm_deref_id(cur_cm_id_priv);
+ }
+ return NULL;
+ }
+
+ /* Check for stale connections. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ if (cur_cm_id_priv) {
+ cm_id = &cur_cm_id_priv->id;
+ ib_send_cm_dreq(cm_id, NULL, 0);
+ cm_deref_id(cur_cm_id_priv);
+ }
+ return NULL;
+ }
+
+ /* Find matching listen request. */
+ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+ req_msg->service_id);
+ if (!listen_cm_id_priv) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ goto out;
+ }
+ atomic_inc(&listen_cm_id_priv->refcount);
+ atomic_inc(&cm_id_priv->refcount);
+ cm_id_priv->id.state = IB_CM_REQ_RCVD;
+ atomic_inc(&cm_id_priv->work_count);
+ spin_unlock_irq(&cm.lock);
+out:
+ return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections. If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+ if (!cm_req_get_primary_subnet_local(req_msg)) {
+ if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+ req_msg->primary_local_lid = ib_lid_be16(wc->slid);
+ cm_req_set_primary_sl(req_msg, wc->sl);
+ }
+
+ if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+ req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+ }
+
+ if (!cm_req_get_alt_subnet_local(req_msg)) {
+ if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+ req_msg->alt_local_lid = ib_lid_be16(wc->slid);
+ cm_req_set_alt_sl(req_msg, wc->sl);
+ }
+
+ if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+ req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+ }
+}
+
+static int cm_req_handler(struct cm_work *work)
+{
+ struct ib_cm_id *cm_id;
+ struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+ struct cm_req_msg *req_msg;
+ const struct ib_global_route *grh;
+ const struct ib_gid_attr *gid_attr;
+ int ret;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ cm_id_priv->id.remote_id = req_msg->local_comm_id;
+ ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ if (ret)
+ goto destroy;
+ cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+ id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ goto destroy;
+ }
+ cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id;
+ cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid;
+ cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg);
+
+ listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+ if (!listen_cm_id_priv) {
+ pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__,
+ be32_to_cpu(cm_id->local_id));
+ ret = -EINVAL;
+ goto destroy;
+ }
+
+ cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = listen_cm_id_priv->id.context;
+ cm_id_priv->id.service_id = req_msg->service_id;
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+ cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+
+ memset(&work->path[0], 0, sizeof(work->path[0]));
+ if (cm_req_has_alt_path(req_msg))
+ memset(&work->path[1], 0, sizeof(work->path[1]));
+ grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);
+ gid_attr = grh->sgid_attr;
+
+ if (gid_attr && gid_attr->ndev) {
+ work->path[0].rec_type =
+ sa_conv_gid_to_pathrec_type(gid_attr->gid_type);
+ } else {
+ /* If no GID attribute or ndev is null, it is not RoCE. */
+ cm_path_set_rec_type(work->port->cm_dev->ib_device,
+ work->port->port_num,
+ &work->path[0],
+ &req_msg->primary_local_gid);
+ }
+ if (cm_req_has_alt_path(req_msg))
+ work->path[1].rec_type = work->path[0].rec_type;
+ cm_format_paths_from_req(req_msg, &work->path[0],
+ &work->path[1]);
+ if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
+ sa_path_set_dmac(&work->path[0],
+ cm_id_priv->av.ah_attr.roce.dmac);
+ work->path[0].hop_limit = grh->hop_limit;
+ ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av,
+ cm_id_priv);
+ if (ret) {
+ int err;
+
+ err = rdma_query_gid(work->port->cm_dev->ib_device,
+ work->port->port_num, 0,
+ &work->path[0].sgid);
+ if (err)
+ ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+ NULL, 0, NULL, 0);
+ else
+ ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+ &work->path[0].sgid,
+ sizeof(work->path[0].sgid),
+ NULL, 0);
+ goto rejected;
+ }
+ if (cm_req_has_alt_path(req_msg)) {
+ ret = cm_init_av_by_path(&work->path[1], NULL,
+ &cm_id_priv->alt_av, cm_id_priv);
+ if (ret) {
+ ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+ &work->path[0].sgid,
+ sizeof(work->path[0].sgid), NULL, 0);
+ goto rejected;
+ }
+ }
+ cm_id_priv->tid = req_msg->hdr.tid;
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ cm_req_get_local_resp_timeout(req_msg));
+ cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
+ cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
+ cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
+ cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg);
+ cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg);
+ cm_id_priv->pkey = req_msg->pkey;
+ cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg);
+ cm_id_priv->retry_count = cm_req_get_retry_count(req_msg);
+ cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+ cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+ cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+ cm_process_work(cm_id_priv, work);
+ cm_deref_id(listen_cm_id_priv);
+ return 0;
+
+rejected:
+ atomic_dec(&cm_id_priv->refcount);
+ cm_deref_id(listen_cm_id_priv);
+destroy:
+ ib_destroy_cm_id(cm_id);
+ return ret;
+}
+
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_rep_param *param)
+{
+ cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
+ rep_msg->local_comm_id = cm_id_priv->id.local_id;
+ rep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
+ rep_msg->resp_resources = param->responder_resources;
+ cm_rep_set_target_ack_delay(rep_msg,
+ cm_id_priv->av.port->cm_dev->ack_delay);
+ cm_rep_set_failover(rep_msg, param->failover_accepted);
+ cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
+ rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+
+ if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) {
+ rep_msg->initiator_depth = param->initiator_depth;
+ cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+ cm_rep_set_srq(rep_msg, param->srq);
+ cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
+ } else {
+ cm_rep_set_srq(rep_msg, 1);
+ cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num));
+ }
+
+ if (param->private_data && param->private_data_len)
+ memcpy(rep_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ struct cm_rep_msg *rep_msg;
+ unsigned long flags;
+ int ret;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REQ_RCVD &&
+ cm_id->state != IB_CM_MRA_REQ_SENT) {
+ pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__,
+ be32_to_cpu(cm_id_priv->id.local_id), cm_id->state);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ rep_msg = (struct cm_rep_msg *) msg->mad;
+ cm_format_rep(rep_msg, cm_id_priv, param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_REP_SENT;
+ cm_id_priv->msg = msg;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
+ cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+ rtu_msg->local_comm_id = cm_id_priv->id.local_id;
+ rtu_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ if (private_data && private_data_len)
+ memcpy(rtu_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REP_RCVD &&
+ cm_id->state != IB_CM_MRA_REP_SENT) {
+ pr_debug("%s: local_id %d, cm_id->state %d\n", __func__,
+ be32_to_cpu(cm_id->local_id), cm_id->state);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error;
+
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ kfree(data);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_ESTABLISHED;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type)
+{
+ struct cm_rep_msg *rep_msg;
+ struct ib_cm_rep_event_param *param;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rep_rcvd;
+ param->remote_ca_guid = rep_msg->local_ca_guid;
+ param->remote_qkey = be32_to_cpu(rep_msg->local_qkey);
+ param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type));
+ param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg));
+ param->responder_resources = rep_msg->initiator_depth;
+ param->initiator_depth = rep_msg->resp_resources;
+ param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+ param->failover_accepted = cm_rep_get_failover(rep_msg);
+ param->flow_control = cm_rep_get_flow_ctrl(rep_msg);
+ param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+ param->srq = cm_rep_get_srq(rep_msg);
+ work->cm_event.private_data = &rep_msg->private_data;
+}
+
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id,
+ rep_msg->local_comm_id);
+ if (!cm_id_priv)
+ return;
+
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_REP_COUNTER]);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ goto deref;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else
+ goto unlock;
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ goto deref;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_msg(msg);
+deref: cm_deref_id(cm_id_priv);
+}
+
+static int cm_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ int ret;
+ struct cm_id_private *cur_cm_id_priv;
+ struct ib_cm_id *cm_id;
+ struct cm_timewait_info *timewait_info;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
+ if (!cm_id_priv) {
+ cm_dup_rep_handler(work);
+ pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__,
+ be32_to_cpu(rep_msg->remote_comm_id));
+ return -EINVAL;
+ }
+
+ cm_format_rep_event(work, cm_id_priv->qp_type);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ pr_debug("%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n",
+ __func__, cm_id_priv->id.state,
+ be32_to_cpu(rep_msg->local_comm_id),
+ be32_to_cpu(rep_msg->remote_comm_id));
+ goto error;
+ }
+
+ cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
+ cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+
+ spin_lock(&cm.lock);
+ /* Check for duplicate REP. */
+ if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ pr_debug("%s: Failed to insert remote id %d\n", __func__,
+ be32_to_cpu(rep_msg->remote_comm_id));
+ goto error;
+ }
+ /* Check for a stale connection. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ rb_erase(&cm_id_priv->timewait_info->remote_id_node,
+ &cm.remote_id_table);
+ cm_id_priv->timewait_info->inserted_remote_id = 0;
+ cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+ NULL, 0);
+ ret = -EINVAL;
+ pr_debug("%s: Stale connection. local_comm_id %d, remote_comm_id %d\n",
+ __func__, be32_to_cpu(rep_msg->local_comm_id),
+ be32_to_cpu(rep_msg->remote_comm_id));
+
+ if (cur_cm_id_priv) {
+ cm_id = &cur_cm_id_priv->id;
+ ib_send_cm_dreq(cm_id, NULL, 0);
+ cm_deref_id(cur_cm_id_priv);
+ }
+
+ goto error;
+ }
+ spin_unlock(&cm.lock);
+
+ cm_id_priv->id.state = IB_CM_REP_RCVD;
+ cm_id_priv->id.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+ cm_id_priv->initiator_depth = rep_msg->resp_resources;
+ cm_id_priv->responder_resources = rep_msg->initiator_depth;
+ cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
+ cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+ cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+ cm_id_priv->av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->av.timeout - 1);
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ /* todo: handle peer_to_peer */
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+error:
+ cm_deref_id(cm_id_priv);
+ return ret;
+}
+
+static int cm_establish_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ /* See comment in cm_establish about lookup. */
+ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_rtu_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rtu_msg *rtu_msg;
+ int ret;
+
+ rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id,
+ rtu_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &rtu_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_REP_SENT &&
+ cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_RTU_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+ cm_form_tid(cm_id_priv));
+ dreq_msg->local_comm_id = cm_id_priv->id.local_id;
+ dreq_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn);
+
+ if (private_data && private_data_len)
+ memcpy(dreq_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED) {
+ pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
+ be32_to_cpu(cm_id->local_id), cm_id->state);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cm_id->lap_state == IB_CM_LAP_SENT ||
+ cm_id->lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret) {
+ cm_enter_timewait(cm_id_priv);
+ goto out;
+ }
+
+ cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ cm_enter_timewait(cm_id_priv);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_DREQ_SENT;
+ cm_id_priv->msg = msg;
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+ drep_msg->local_comm_id = cm_id_priv->id.local_id;
+ drep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ if (private_data && private_data_len)
+ memcpy(drep_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_DREQ_RCVD) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n",
+ __func__, be32_to_cpu(cm_id->local_id), cm_id->state);
+ return -EINVAL;
+ }
+
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ cm_enter_timewait(cm_id_priv);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
+static int cm_issue_drep(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_dreq_msg *dreq_msg;
+ struct cm_drep_msg *drep_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+ drep_msg = (struct cm_drep_msg *) msg->mad;
+
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
+ drep_msg->remote_comm_id = dreq_msg->local_comm_id;
+ drep_msg->local_comm_id = dreq_msg->remote_comm_id;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+ return ret;
+}
+
+static int cm_dreq_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_dreq_msg *dreq_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
+ dreq_msg->local_comm_id);
+ if (!cm_id_priv) {
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ cm_issue_drep(work->port, work->mad_recv_wc);
+ pr_debug("%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n",
+ __func__, be32_to_cpu(dreq_msg->local_comm_id),
+ be32_to_cpu(dreq_msg->remote_comm_id));
+ return -EINVAL;
+ }
+
+ work->cm_event.private_data = &dreq_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg))
+ goto unlock;
+
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REP_SENT:
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+ cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
+ case IB_CM_MRA_REP_RCVD:
+ break;
+ case IB_CM_TIMEWAIT:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ if (IS_ERR(msg))
+ goto unlock;
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
+ ib_post_send_mad(msg, NULL))
+ cm_free_msg(msg);
+ goto deref;
+ case IB_CM_DREQ_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ goto unlock;
+ default:
+ pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+ __func__, be32_to_cpu(cm_id_priv->id.local_id),
+ cm_id_priv->id.state);
+ goto unlock;
+ }
+ cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+ cm_id_priv->tid = dreq_msg->hdr.tid;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_drep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_drep_msg *drep_msg;
+ int ret;
+
+ drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id,
+ drep_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &drep_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
+ cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_enter_timewait(cm_id_priv);
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
+ (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id->state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (!ret)
+ cm_format_rej((struct cm_rej_msg *) msg->mad,
+ cm_id_priv, reason, ari, ari_length,
+ private_data, private_data_len);
+
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (!ret)
+ cm_format_rej((struct cm_rej_msg *) msg->mad,
+ cm_id_priv, reason, ari, ari_length,
+ private_data, private_data_len);
+
+ cm_enter_timewait(cm_id_priv);
+ break;
+ default:
+ pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
+ be32_to_cpu(cm_id_priv->id.local_id), cm_id->state);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ret)
+ goto out;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+static void cm_format_rej_event(struct cm_work *work)
+{
+ struct cm_rej_msg *rej_msg;
+ struct ib_cm_rej_event_param *param;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rej_rcvd;
+ param->ari = rej_msg->ari;
+ param->ari_length = cm_rej_get_reject_info_len(rej_msg);
+ param->reason = __be16_to_cpu(rej_msg->reason);
+ work->cm_event.private_data = &rej_msg->private_data;
+}
+
+static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *cm_id_priv;
+ __be32 remote_id;
+
+ remote_id = rej_msg->local_comm_id;
+
+ if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) {
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari),
+ remote_id);
+ if (!timewait_info) {
+ spin_unlock_irq(&cm.lock);
+ return NULL;
+ }
+ cm_id_priv = idr_find(&cm.local_id_table, (__force int)
+ (timewait_info->work.local_id ^
+ cm.random_id_operand));
+ if (cm_id_priv) {
+ if (cm_id_priv->id.remote_id == remote_id)
+ atomic_inc(&cm_id_priv->refcount);
+ else
+ cm_id_priv = NULL;
+ }
+ spin_unlock_irq(&cm.lock);
+ } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ)
+ cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0);
+ else
+ cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id);
+
+ return cm_id_priv;
+}
+
+static int cm_rej_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rej_msg *rej_msg;
+ int ret;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_rejected_id(rej_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ cm_format_rej_event(work);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN)
+ cm_enter_timewait(cm_id_priv);
+ else
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ cm_enter_timewait(cm_id_priv);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
+ cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ break;
+ }
+ /* fall through */
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+ __func__, be32_to_cpu(cm_id_priv->id.local_id),
+ cm_id_priv->id.state);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+ u8 service_timeout,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ enum ib_cm_state cm_state;
+ enum ib_cm_lap_state lap_state;
+ enum cm_msg_response msg_response;
+ void *data;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch(cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ cm_state = IB_CM_MRA_REQ_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REQ;
+ break;
+ case IB_CM_REP_RCVD:
+ cm_state = IB_CM_MRA_REP_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REP;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id->lap_state == IB_CM_LAP_RCVD) {
+ cm_state = cm_id->state;
+ lap_state = IB_CM_MRA_LAP_SENT;
+ msg_response = CM_MSG_RESPONSE_OTHER;
+ break;
+ }
+ /* fall through */
+ default:
+ pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+ __func__, be32_to_cpu(cm_id_priv->id.local_id),
+ cm_id_priv->id.state);
+ ret = -EINVAL;
+ goto error1;
+ }
+
+ if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error1;
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ msg_response, service_timeout,
+ private_data, private_data_len);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto error2;
+ }
+
+ cm_id->state = cm_state;
+ cm_id->lap_state = lap_state;
+ cm_id_priv->service_timeout = service_timeout;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+
+error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ cm_free_msg(msg);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+ switch (cm_mra_get_msg_mraed(mra_msg)) {
+ case CM_MSG_RESPONSE_REQ:
+ return cm_acquire_id(mra_msg->remote_comm_id, 0);
+ case CM_MSG_RESPONSE_REP:
+ case CM_MSG_RESPONSE_OTHER:
+ return cm_acquire_id(mra_msg->remote_comm_id,
+ mra_msg->local_comm_id);
+ default:
+ return NULL;
+ }
+}
+
+static int cm_mra_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_mra_msg *mra_msg;
+ int timeout, ret;
+
+ mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_mraed_id(mra_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &mra_msg->private_data;
+ work->cm_event.param.mra_rcvd.service_timeout =
+ cm_mra_get_service_timeout(mra_msg);
+ timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
+ cm_convert_to_ms(cm_id_priv->av.timeout);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+ break;
+ case IB_CM_REP_SENT:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
+ cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout)) {
+ if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ atomic_long_inc(&work->port->
+ counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+ break;
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_MRA_REP_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ /* fall through */
+ default:
+ pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n",
+ __func__, be32_to_cpu(cm_id_priv->id.local_id),
+ cm_id_priv->id.state);
+ goto out;
+ }
+
+ cm_id_priv->msg->context[1] = (void *) (unsigned long)
+ cm_id_priv->id.state;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_lap(struct cm_lap_msg *lap_msg,
+ struct cm_id_private *cm_id_priv,
+ struct sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len)
+{
+ bool alt_ext = false;
+
+ if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA)
+ alt_ext = opa_is_extended_lid(alternate_path->opa.dlid,
+ alternate_path->opa.slid);
+ cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID,
+ cm_form_tid(cm_id_priv));
+ lap_msg->local_comm_id = cm_id_priv->id.local_id;
+ lap_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn);
+ /* todo: need remote CM response timeout */
+ cm_lap_set_remote_resp_timeout(lap_msg, 0x1F);
+ lap_msg->alt_local_lid =
+ htons(ntohl(sa_path_get_slid(alternate_path)));
+ lap_msg->alt_remote_lid =
+ htons(ntohl(sa_path_get_dlid(alternate_path)));
+ lap_msg->alt_local_gid = alternate_path->sgid;
+ lap_msg->alt_remote_gid = alternate_path->dgid;
+ if (alt_ext) {
+ lap_msg->alt_local_gid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid));
+ lap_msg->alt_remote_gid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid));
+ }
+ cm_lap_set_flow_label(lap_msg, alternate_path->flow_label);
+ cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class);
+ lap_msg->alt_hop_limit = alternate_path->hop_limit;
+ cm_lap_set_packet_rate(lap_msg, alternate_path->rate);
+ cm_lap_set_sl(lap_msg, alternate_path->sl);
+ cm_lap_set_subnet_local(lap_msg, 1); /* local only... */
+ cm_lap_set_local_ack_timeout(lap_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ alternate_path->packet_life_time));
+
+ if (private_data && private_data_len)
+ memcpy(lap_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+ struct sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED ||
+ (cm_id->lap_state != IB_CM_LAP_UNINIT &&
+ cm_id->lap_state != IB_CM_LAP_IDLE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_init_av_by_path(alternate_path, NULL, &cm_id_priv->alt_av,
+ cm_id_priv);
+ if (ret)
+ goto out;
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv,
+ alternate_path, private_data, private_data_len);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->lap_state = IB_CM_LAP_SENT;
+ cm_id_priv->msg = msg;
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_lap);
+
+static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg,
+ struct sa_path_rec *path)
+{
+ u32 lid;
+
+ if (path->rec_type != SA_PATH_REC_TYPE_OPA) {
+ sa_path_set_dlid(path, ntohs(lap_msg->alt_local_lid));
+ sa_path_set_slid(path, ntohs(lap_msg->alt_remote_lid));
+ } else {
+ lid = opa_get_lid_from_gid(&lap_msg->alt_local_gid);
+ sa_path_set_dlid(path, lid);
+
+ lid = opa_get_lid_from_gid(&lap_msg->alt_remote_gid);
+ sa_path_set_slid(path, lid);
+ }
+}
+
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+ struct sa_path_rec *path,
+ struct cm_lap_msg *lap_msg)
+{
+ path->dgid = lap_msg->alt_local_gid;
+ path->sgid = lap_msg->alt_remote_gid;
+ path->flow_label = cm_lap_get_flow_label(lap_msg);
+ path->hop_limit = lap_msg->alt_hop_limit;
+ path->traffic_class = cm_lap_get_traffic_class(lap_msg);
+ path->reversible = 1;
+ path->pkey = cm_id_priv->pkey;
+ path->sl = cm_lap_get_sl(lap_msg);
+ path->mtu_selector = IB_SA_EQ;
+ path->mtu = cm_id_priv->path_mtu;
+ path->rate_selector = IB_SA_EQ;
+ path->rate = cm_lap_get_packet_rate(lap_msg);
+ path->packet_life_time_selector = IB_SA_EQ;
+ path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg);
+ path->packet_life_time -= (path->packet_life_time > 0);
+ cm_format_path_lid_from_lap(lap_msg, path);
+}
+
+static int cm_lap_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_lap_msg *lap_msg;
+ struct ib_cm_lap_event_param *param;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ /* Currently Alternate path messages are not supported for
+ * RoCE link layer.
+ */
+ if (rdma_protocol_roce(work->port->cm_dev->ib_device,
+ work->port->port_num))
+ return -EINVAL;
+
+ /* todo: verify LAP request and send reject APR if invalid. */
+ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id,
+ lap_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ param = &work->cm_event.param.lap_rcvd;
+ memset(&work->path[0], 0, sizeof(work->path[1]));
+ cm_path_set_rec_type(work->port->cm_dev->ib_device,
+ work->port->port_num,
+ &work->path[0],
+ &lap_msg->alt_local_gid);
+ param->alternate_path = &work->path[0];
+ cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+ work->cm_event.private_data = &lap_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+ goto unlock;
+
+ switch (cm_id_priv->id.lap_state) {
+ case IB_CM_LAP_UNINIT:
+ case IB_CM_LAP_IDLE:
+ break;
+ case IB_CM_MRA_LAP_SENT:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_LAP_COUNTER]);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ if (IS_ERR(msg))
+ goto unlock;
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_OTHER,
+ cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
+ ib_post_send_mad(msg, NULL))
+ cm_free_msg(msg);
+ goto deref;
+ case IB_CM_LAP_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_LAP_COUNTER]);
+ goto unlock;
+ default:
+ goto unlock;
+ }
+
+ ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ if (ret)
+ goto unlock;
+
+ ret = cm_init_av_by_path(param->alternate_path, NULL,
+ &cm_id_priv->alt_av, cm_id_priv);
+ if (ret)
+ goto unlock;
+
+ cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+ cm_id_priv->tid = lap_msg->hdr.tid;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_apr(struct cm_apr_msg *apr_msg,
+ struct cm_id_private *cm_id_priv,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid);
+ apr_msg->local_comm_id = cm_id_priv->id.local_id;
+ apr_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ apr_msg->ap_status = (u8) status;
+
+ if (info && info_length) {
+ apr_msg->info_length = info_length;
+ memcpy(apr_msg->info, info, info_length);
+ }
+
+ if (private_data && private_data_len)
+ memcpy(apr_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) ||
+ (info && info_length > IB_CM_APR_INFO_LENGTH))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED ||
+ (cm_id->lap_state != IB_CM_LAP_RCVD &&
+ cm_id->lap_state != IB_CM_MRA_LAP_SENT)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status,
+ info, info_length, private_data, private_data_len);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->lap_state = IB_CM_LAP_IDLE;
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_apr);
+
+static int cm_apr_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_apr_msg *apr_msg;
+ int ret;
+
+ /* Currently Alternate path messages are not supported for
+ * RoCE link layer.
+ */
+ if (rdma_protocol_roce(work->port->cm_dev->ib_device,
+ work->port->port_num))
+ return -EINVAL;
+
+ apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id,
+ apr_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status;
+ work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info;
+ work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length;
+ work->cm_event.private_data = &apr_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+ (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+ cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ cm_id_priv->msg = NULL;
+
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_timewait_handler(struct cm_work *work)
+{
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ timewait_info = (struct cm_timewait_info *)work;
+ spin_lock_irq(&cm.lock);
+ list_del(&timewait_info->list);
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+ cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_req_param *param)
+{
+ cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv));
+ sidr_req_msg->request_id = cm_id_priv->id.local_id;
+ sidr_req_msg->pkey = param->path->pkey;
+ sidr_req_msg->service_id = param->service_id;
+
+ if (param->private_data && param->private_data_len)
+ memcpy(sidr_req_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_req_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (!param->path || (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ ret = cm_init_av_by_path(param->path, param->sgid_attr,
+ &cm_id_priv->av,
+ cm_id_priv);
+ if (ret)
+ goto out;
+
+ cm_id->service_id = param->service_id;
+ cm_id->service_mask = ~cpu_to_be64(0);
+ cm_id_priv->timeout_ms = param->timeout_ms;
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv,
+ param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state == IB_CM_IDLE)
+ ret = ib_post_send_mad(msg, NULL);
+ else
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ goto out;
+ }
+ cm_id->state = IB_CM_SIDR_REQ_SENT;
+ cm_id_priv->msg = msg;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+static void cm_format_sidr_req_event(struct cm_work *work,
+ const struct cm_id_private *rx_cm_id,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_cm_sidr_req_event_param *param;
+
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_req_rcvd;
+ param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
+ param->listen_id = listen_id;
+ param->service_id = sidr_req_msg->service_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
+ param->port = work->port->port_num;
+ param->sgid_attr = rx_cm_id->av.ah_attr.grh.sgid_attr;
+ work->cm_event.private_data = &sidr_req_msg->private_data;
+}
+
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+ struct ib_cm_id *cm_id;
+ struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_wc *wc;
+ int ret;
+
+ cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ /* Record SGID/SLID and request ID for lookup. */
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ wc = work->mad_recv_wc->wc;
+ cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
+ cm_id_priv->av.dgid.global.interface_id = 0;
+ ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ if (ret)
+ goto out;
+
+ cm_id_priv->id.remote_id = sidr_req_msg->request_id;
+ cm_id_priv->tid = sidr_req_msg->hdr.tid;
+ atomic_inc(&cm_id_priv->work_count);
+
+ spin_lock_irq(&cm.lock);
+ cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+ if (cur_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_SIDR_REQ_COUNTER]);
+ goto out; /* Duplicate message. */
+ }
+ cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+ cur_cm_id_priv = cm_find_listen(cm_id->device,
+ sidr_req_msg->service_id);
+ if (!cur_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
+ goto out; /* No match. */
+ }
+ atomic_inc(&cur_cm_id_priv->refcount);
+ atomic_inc(&cm_id_priv->refcount);
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = cur_cm_id_priv->id.context;
+ cm_id_priv->id.service_id = sidr_req_msg->service_id;
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+ cm_format_sidr_req_event(work, cm_id_priv, &cur_cm_id_priv->id);
+ cm_process_work(cm_id_priv, work);
+ cm_deref_id(cur_cm_id_priv);
+ return 0;
+out:
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_rep_param *param)
+{
+ cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+ cm_id_priv->tid);
+ sidr_rep_msg->request_id = cm_id_priv->id.remote_id;
+ sidr_rep_msg->status = param->status;
+ cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num));
+ sidr_rep_msg->service_id = cm_id_priv->id.service_id;
+ sidr_rep_msg->qkey = cpu_to_be32(param->qkey);
+
+ if (param->info && param->info_length)
+ memcpy(sidr_rep_msg->info, param->info, param->info_length);
+
+ if (param->private_data && param->private_data_len)
+ memcpy(sidr_rep_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
+ (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error;
+
+ cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
+ param);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+ cm_id->state = IB_CM_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
+ rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+ }
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return 0;
+
+error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+static void cm_format_sidr_rep_event(struct cm_work *work,
+ const struct cm_id_private *cm_id_priv)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct ib_cm_sidr_rep_event_param *param;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_rep_rcvd;
+ param->status = sidr_rep_msg->status;
+ param->qkey = be32_to_cpu(sidr_rep_msg->qkey);
+ param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg));
+ param->info = &sidr_rep_msg->info;
+ param->info_len = sidr_rep_msg->info_length;
+ param->sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr;
+ work->cm_event.private_data = &sidr_rep_msg->private_data;
+}
+
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct cm_id_private *cm_id_priv;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0);
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ cm_format_sidr_rep_event(work, cm_id_priv);
+ cm_process_work(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_process_send_error(struct ib_mad_send_buf *msg,
+ enum ib_wc_status wc_status)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_event cm_event;
+ enum ib_cm_state state;
+ int ret;
+
+ memset(&cm_event, 0, sizeof cm_event);
+ cm_id_priv = msg->context[0];
+
+ /* Discard old sends or ones without a response. */
+ spin_lock_irq(&cm_id_priv->lock);
+ state = (enum ib_cm_state) (unsigned long) msg->context[1];
+ if (msg != cm_id_priv->msg || state != cm_id_priv->id.state)
+ goto discard;
+
+ pr_debug_ratelimited("CM: failed sending MAD in state %d. (%s)\n",
+ state, ib_wc_status_msg(wc_status));
+ switch (state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REQ_ERROR;
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REP_ERROR;
+ break;
+ case IB_CM_DREQ_SENT:
+ cm_enter_timewait(cm_id_priv);
+ cm_event.event = IB_CM_DREQ_ERROR;
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_event.event = IB_CM_SIDR_REQ_ERROR;
+ break;
+ default:
+ goto discard;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_event.param.send_status = wc_status;
+
+ /* No other events can occur on the cm_id at this point. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+ cm_free_msg(msg);
+ if (ret)
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return;
+discard:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_free_msg(msg);
+}
+
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+ struct cm_port *port;
+ u16 attr_index;
+
+ port = mad_agent->context;
+ attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+ msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+ /*
+ * If the send was in response to a received message (context[0] is not
+ * set to a cm_id), and is not a REJ, then it is a send that was
+ * manually retried.
+ */
+ if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+ msg->retries = 1;
+
+ atomic_long_add(1 + msg->retries,
+ &port->counter_group[CM_XMIT].counter[attr_index]);
+ if (msg->retries)
+ atomic_long_add(msg->retries,
+ &port->counter_group[CM_XMIT_RETRIES].
+ counter[attr_index]);
+
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+ case IB_WC_WR_FLUSH_ERR:
+ cm_free_msg(msg);
+ break;
+ default:
+ if (msg->context[0] && msg->context[1])
+ cm_process_send_error(msg, mad_send_wc->status);
+ else
+ cm_free_msg(msg);
+ break;
+ }
+}
+
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct cm_work *work = container_of(_work, struct cm_work, work.work);
+ int ret;
+
+ switch (work->cm_event.event) {
+ case IB_CM_REQ_RECEIVED:
+ ret = cm_req_handler(work);
+ break;
+ case IB_CM_MRA_RECEIVED:
+ ret = cm_mra_handler(work);
+ break;
+ case IB_CM_REJ_RECEIVED:
+ ret = cm_rej_handler(work);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ret = cm_rep_handler(work);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ ret = cm_rtu_handler(work);
+ break;
+ case IB_CM_USER_ESTABLISHED:
+ ret = cm_establish_handler(work);
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ ret = cm_dreq_handler(work);
+ break;
+ case IB_CM_DREP_RECEIVED:
+ ret = cm_drep_handler(work);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ ret = cm_sidr_req_handler(work);
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ ret = cm_sidr_rep_handler(work);
+ break;
+ case IB_CM_LAP_RECEIVED:
+ ret = cm_lap_handler(work);
+ break;
+ case IB_CM_APR_RECEIVED:
+ ret = cm_apr_handler(work);
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ ret = cm_timewait_handler(work);
+ break;
+ default:
+ pr_debug("cm_event.event: 0x%x\n", work->cm_event.event);
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ cm_free_work(work);
+}
+
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+ unsigned long flags;
+ int ret = 0;
+ struct cm_device *cm_dev;
+
+ cm_dev = ib_get_client_data(cm_id->device, &cm_client);
+ if (!cm_dev)
+ return -ENODEV;
+
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id->state)
+ {
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_id->state = IB_CM_ESTABLISHED;
+ break;
+ case IB_CM_ESTABLISHED:
+ ret = -EISCONN;
+ break;
+ default:
+ pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
+ be32_to_cpu(cm_id->local_id), cm_id->state);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (ret) {
+ kfree(work);
+ goto out;
+ }
+
+ /*
+ * The CM worker thread may try to destroy the cm_id before it
+ * can execute this work item. To prevent potential deadlock,
+ * we need to find the cm_id once we're in the context of the
+ * worker thread, rather than holding a reference on it.
+ */
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->local_id = cm_id->local_id;
+ work->remote_id = cm_id->remote_id;
+ work->mad_recv_wc = NULL;
+ work->cm_event.event = IB_CM_USER_ESTABLISHED;
+
+ /* Check if the device started its remove_one */
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_dev->going_down) {
+ queue_delayed_work(cm.wq, &work->work, 0);
+ } else {
+ kfree(work);
+ ret = -ENODEV;
+ }
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+out:
+ return ret;
+}
+
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_av tmp_av;
+ unsigned long flags;
+ int tmp_send_port_not_ready;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state == IB_CM_ESTABLISHED &&
+ (cm_id->lap_state == IB_CM_LAP_UNINIT ||
+ cm_id->lap_state == IB_CM_LAP_IDLE)) {
+ cm_id->lap_state = IB_CM_LAP_IDLE;
+ /* Swap address vector */
+ tmp_av = cm_id_priv->av;
+ cm_id_priv->av = cm_id_priv->alt_av;
+ cm_id_priv->alt_av = tmp_av;
+ /* Swap port send ready state */
+ tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready;
+ cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready;
+ cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready;
+ } else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+ int ret;
+
+ switch (event) {
+ case IB_EVENT_COMM_EST:
+ ret = cm_establish(cm_id);
+ break;
+ case IB_EVENT_PATH_MIG:
+ ret = cm_migrate(cm_id);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct cm_port *port = mad_agent->context;
+ struct cm_work *work;
+ enum ib_cm_event_type event;
+ bool alt_path = false;
+ u16 attr_id;
+ int paths = 0;
+ int going_down = 0;
+
+ switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+ case CM_REQ_ATTR_ID:
+ alt_path = cm_req_has_alt_path((struct cm_req_msg *)
+ mad_recv_wc->recv_buf.mad);
+ paths = 1 + (alt_path != 0);
+ event = IB_CM_REQ_RECEIVED;
+ break;
+ case CM_MRA_ATTR_ID:
+ event = IB_CM_MRA_RECEIVED;
+ break;
+ case CM_REJ_ATTR_ID:
+ event = IB_CM_REJ_RECEIVED;
+ break;
+ case CM_REP_ATTR_ID:
+ event = IB_CM_REP_RECEIVED;
+ break;
+ case CM_RTU_ATTR_ID:
+ event = IB_CM_RTU_RECEIVED;
+ break;
+ case CM_DREQ_ATTR_ID:
+ event = IB_CM_DREQ_RECEIVED;
+ break;
+ case CM_DREP_ATTR_ID:
+ event = IB_CM_DREP_RECEIVED;
+ break;
+ case CM_SIDR_REQ_ATTR_ID:
+ event = IB_CM_SIDR_REQ_RECEIVED;
+ break;
+ case CM_SIDR_REP_ATTR_ID:
+ event = IB_CM_SIDR_REP_RECEIVED;
+ break;
+ case CM_LAP_ATTR_ID:
+ paths = 1;
+ event = IB_CM_LAP_RECEIVED;
+ break;
+ case CM_APR_ATTR_ID:
+ event = IB_CM_APR_RECEIVED;
+ break;
+ default:
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+ atomic_long_inc(&port->counter_group[CM_RECV].
+ counter[attr_id - CM_ATTR_ID_OFFSET]);
+
+ work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths,
+ GFP_KERNEL);
+ if (!work) {
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->cm_event.event = event;
+ work->mad_recv_wc = mad_recv_wc;
+ work->port = port;
+
+ /* Check if the device started its remove_one */
+ spin_lock_irq(&cm.lock);
+ if (!port->cm_dev->going_down)
+ queue_delayed_work(cm.wq, &work->work, 0);
+ else
+ going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
+ if (going_down) {
+ kfree(work);
+ ib_free_recv_mad(mad_recv_wc);
+ }
+}
+
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX | IB_QP_PORT;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+ if (cm_id_priv->responder_resources)
+ qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_ATOMIC;
+ qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+ qp_attr->port_num = cm_id_priv->av.port->port_num;
+ ret = 0;
+ break;
+ default:
+ pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+ __func__, be32_to_cpu(cm_id_priv->id.local_id),
+ cm_id_priv->id.state);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+ qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+ qp_attr->path_mtu = cm_id_priv->path_mtu;
+ qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+ qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+ if (cm_id_priv->qp_type == IB_QPT_RC ||
+ cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+ *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER;
+ qp_attr->max_dest_rd_atomic =
+ cm_id_priv->responder_resources;
+ qp_attr->min_rnr_timer = 0;
+ }
+ if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
+ *qp_attr_mask |= IB_QP_ALT_PATH;
+ qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ }
+ ret = 0;
+ break;
+ default:
+ pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+ __func__, be32_to_cpu(cm_id_priv->id.local_id),
+ cm_id_priv->id.state);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ /* Allow transition to RTS before sending REP */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
+ *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
+ qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+ switch (cm_id_priv->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_XRC_INI:
+ *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC;
+ qp_attr->retry_cnt = cm_id_priv->retry_count;
+ qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+ qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
+ /* fall through */
+ case IB_QPT_XRC_TGT:
+ *qp_attr_mask |= IB_QP_TIMEOUT;
+ qp_attr->timeout = cm_id_priv->av.timeout;
+ break;
+ default:
+ break;
+ }
+ if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
+ *qp_attr_mask |= IB_QP_PATH_MIG_STATE;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ } else {
+ *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+ qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ ret = 0;
+ break;
+ default:
+ pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+ __func__, be32_to_cpu(cm_id_priv->id.local_id),
+ cm_id_priv->id.state);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTR:
+ ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+ char *buf)
+{
+ struct cm_counter_group *group;
+ struct cm_counter_attribute *cm_attr;
+
+ group = container_of(obj, struct cm_counter_group, obj);
+ cm_attr = container_of(attr, struct cm_counter_attribute, attr);
+
+ return sprintf(buf, "%ld\n",
+ atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+static const struct sysfs_ops cm_counter_ops = {
+ .show = cm_show_counter
+};
+
+static struct kobj_type cm_counter_obj_type = {
+ .sysfs_ops = &cm_counter_ops,
+ .default_attrs = cm_counter_default_attrs
+};
+
+static void cm_release_port_obj(struct kobject *obj)
+{
+ struct cm_port *cm_port;
+
+ cm_port = container_of(obj, struct cm_port, port_obj);
+ kfree(cm_port);
+}
+
+static struct kobj_type cm_port_obj_type = {
+ .release = cm_release_port_obj
+};
+
+static char *cm_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+struct class cm_class = {
+ .owner = THIS_MODULE,
+ .name = "infiniband_cm",
+ .devnode = cm_devnode,
+};
+EXPORT_SYMBOL(cm_class);
+
+static int cm_create_port_fs(struct cm_port *port)
+{
+ int i, ret;
+
+ ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+ &port->cm_dev->device->kobj,
+ "%d", port->port_num);
+ if (ret) {
+ kfree(port);
+ return ret;
+ }
+
+ for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+ ret = kobject_init_and_add(&port->counter_group[i].obj,
+ &cm_counter_obj_type,
+ &port->port_obj,
+ "%s", counter_group_names[i]);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ while (i--)
+ kobject_put(&port->counter_group[i].obj);
+ kobject_put(&port->port_obj);
+ return ret;
+
+}
+
+static void cm_remove_port_fs(struct cm_port *port)
+{
+ int i;
+
+ for (i = 0; i < CM_COUNTER_GROUPS; i++)
+ kobject_put(&port->counter_group[i].obj);
+
+ kobject_put(&port->port_obj);
+}
+
+static void cm_add_one(struct ib_device *ib_device)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ struct ib_mad_reg_req reg_req = {
+ .mgmt_class = IB_MGMT_CLASS_CM,
+ .mgmt_class_version = IB_CM_CLASS_VERSION,
+ };
+ struct ib_port_modify port_modify = {
+ .set_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ int ret;
+ int count = 0;
+ u8 i;
+
+ cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt),
+ GFP_KERNEL);
+ if (!cm_dev)
+ return;
+
+ cm_dev->ib_device = ib_device;
+ cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
+ cm_dev->going_down = 0;
+ cm_dev->device = device_create(&cm_class, &ib_device->dev,
+ MKDEV(0, 0), NULL,
+ "%s", ib_device->name);
+ if (IS_ERR(cm_dev->device)) {
+ kfree(cm_dev);
+ return;
+ }
+
+ set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+ for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
+ port = kzalloc(sizeof *port, GFP_KERNEL);
+ if (!port)
+ goto error1;
+
+ cm_dev->port[i-1] = port;
+ port->cm_dev = cm_dev;
+ port->port_num = i;
+
+ INIT_LIST_HEAD(&port->cm_priv_prim_list);
+ INIT_LIST_HEAD(&port->cm_priv_altr_list);
+
+ ret = cm_create_port_fs(port);
+ if (ret)
+ goto error1;
+
+ port->mad_agent = ib_register_mad_agent(ib_device, i,
+ IB_QPT_GSI,
+ &reg_req,
+ 0,
+ cm_send_handler,
+ cm_recv_handler,
+ port,
+ 0);
+ if (IS_ERR(port->mad_agent))
+ goto error2;
+
+ ret = ib_modify_port(ib_device, i, 0, &port_modify);
+ if (ret)
+ goto error3;
+
+ count++;
+ }
+
+ if (!count)
+ goto free;
+
+ ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_add_tail(&cm_dev->list, &cm.device_list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+ return;
+
+error3:
+ ib_unregister_mad_agent(port->mad_agent);
+error2:
+ cm_remove_port_fs(port);
+error1:
+ port_modify.set_port_cap_mask = 0;
+ port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+ while (--i) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
+ port = cm_dev->port[i-1];
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->mad_agent);
+ cm_remove_port_fs(port);
+ }
+free:
+ device_unregister(cm_dev->device);
+ kfree(cm_dev);
+}
+
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
+{
+ struct cm_device *cm_dev = client_data;
+ struct cm_port *port;
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_agent *cur_mad_agent;
+ struct ib_port_modify port_modify = {
+ .clr_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ int i;
+
+ if (!cm_dev)
+ return;
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_del(&cm_dev->list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+
+ spin_lock_irq(&cm.lock);
+ cm_dev->going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
+ for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
+ port = cm_dev->port[i-1];
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ /* Mark all the cm_id's as not valid */
+ spin_lock_irq(&cm.lock);
+ list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list)
+ cm_id_priv->altr_send_port_not_ready = 1;
+ list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list)
+ cm_id_priv->prim_send_port_not_ready = 1;
+ spin_unlock_irq(&cm.lock);
+ /*
+ * We flush the queue here after the going_down set, this
+ * verify that no new works will be queued in the recv handler,
+ * after that we can call the unregister_mad_agent
+ */
+ flush_workqueue(cm.wq);
+ spin_lock_irq(&cm.state_lock);
+ cur_mad_agent = port->mad_agent;
+ port->mad_agent = NULL;
+ spin_unlock_irq(&cm.state_lock);
+ ib_unregister_mad_agent(cur_mad_agent);
+ cm_remove_port_fs(port);
+ }
+
+ device_unregister(cm_dev->device);
+ kfree(cm_dev);
+}
+
+static int __init ib_cm_init(void)
+{
+ int ret;
+
+ memset(&cm, 0, sizeof cm);
+ INIT_LIST_HEAD(&cm.device_list);
+ rwlock_init(&cm.device_lock);
+ spin_lock_init(&cm.lock);
+ spin_lock_init(&cm.state_lock);
+ cm.listen_service_table = RB_ROOT;
+ cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+ cm.remote_id_table = RB_ROOT;
+ cm.remote_qp_table = RB_ROOT;
+ cm.remote_sidr_table = RB_ROOT;
+ idr_init(&cm.local_id_table);
+ get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+ INIT_LIST_HEAD(&cm.timewait_list);
+
+ ret = class_register(&cm_class);
+ if (ret) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ cm.wq = alloc_workqueue("ib_cm", 0, 1);
+ if (!cm.wq) {
+ ret = -ENOMEM;
+ goto error2;
+ }
+
+ ret = ib_register_client(&cm_client);
+ if (ret)
+ goto error3;
+
+ return 0;
+error3:
+ destroy_workqueue(cm.wq);
+error2:
+ class_unregister(&cm_class);
+error1:
+ idr_destroy(&cm.local_id_table);
+ return ret;
+}
+
+static void __exit ib_cm_cleanup(void)
+{
+ struct cm_timewait_info *timewait_info, *tmp;
+
+ spin_lock_irq(&cm.lock);
+ list_for_each_entry(timewait_info, &cm.timewait_list, list)
+ cancel_delayed_work(&timewait_info->work.work);
+ spin_unlock_irq(&cm.lock);
+
+ ib_unregister_client(&cm_client);
+ destroy_workqueue(cm.wq);
+
+ list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
+ list_del(&timewait_info->list);
+ kfree(timewait_info);
+ }
+
+ class_unregister(&cm_class);
+ idr_destroy(&cm.local_id_table);
+}
+
+module_init(ib_cm_init);
+module_exit(ib_cm_cleanup);
+
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
new file mode 100644
index 000000000..476d43095
--- /dev/null
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING the madirectory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use source and binary forms, with or
+ * withmodification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retathe above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS THE
+ * SOFTWARE.
+ */
+#if !defined(CM_MSGS_H)
+#define CM_MSGS_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
+
+#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */
+
+struct cm_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 rsvd4;
+ __be64 service_id;
+ __be64 local_ca_guid;
+ __be32 rsvd24;
+ __be32 local_qkey;
+ /* local QPN:24, responder resources:8 */
+ __be32 offset32;
+ /* local EECN:24, initiator depth:8 */
+ __be32 offset36;
+ /*
+ * remote EECN:24, remote CM response timeout:5,
+ * transport service type:2, end-to-end flow control:1
+ */
+ __be32 offset40;
+ /* starting PSN:24, local CM response timeout:5, retry count:3 */
+ __be32 offset44;
+ __be16 pkey;
+ /* path MTU:4, RDC exists:1, RNR retry count:3. */
+ u8 offset50;
+ /* max CM Retries:4, SRQ:1, extended transport type:3 */
+ u8 offset51;
+
+ __be16 primary_local_lid;
+ __be16 primary_remote_lid;
+ union ib_gid primary_local_gid;
+ union ib_gid primary_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 primary_offset88;
+ u8 primary_traffic_class;
+ u8 primary_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 primary_offset94;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 primary_offset95;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 alt_offset132;
+ u8 alt_traffic_class;
+ u8 alt_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 alt_offset138;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 alt_offset139;
+
+ u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8);
+}
+
+static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn)
+{
+ req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(req_msg->offset32) &
+ 0x000000FF));
+}
+
+static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg)
+{
+ return (u8) be32_to_cpu(req_msg->offset32);
+}
+
+static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res)
+{
+ req_msg->offset32 = cpu_to_be32(resp_res |
+ (be32_to_cpu(req_msg->offset32) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg)
+{
+ return (u8) be32_to_cpu(req_msg->offset36);
+}
+
+static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg,
+ u8 init_depth)
+{
+ req_msg->offset36 = cpu_to_be32(init_depth |
+ (be32_to_cpu(req_msg->offset36) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+{
+ req_msg->offset40 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(req_msg->offset40) &
+ 0xFFFFFF07));
+}
+
+static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+{
+ u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1;
+ switch(transport_type) {
+ case 0: return IB_QPT_RC;
+ case 1: return IB_QPT_UC;
+ case 3:
+ switch (req_msg->offset51 & 0x7) {
+ case 1: return IB_QPT_XRC_TGT;
+ default: return 0;
+ }
+ default: return 0;
+ }
+}
+
+static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+ enum ib_qp_type qp_type)
+{
+ switch(qp_type) {
+ case IB_QPT_UC:
+ req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9) | 0x2);
+ break;
+ case IB_QPT_XRC_INI:
+ req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9) | 0x6);
+ req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1;
+ break;
+ default:
+ req_msg->offset40 = cpu_to_be32(be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9);
+ }
+}
+
+static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg)
+{
+ return be32_to_cpu(req_msg->offset40) & 0x1;
+}
+
+static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg,
+ u8 flow_ctrl)
+{
+ req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) |
+ (be32_to_cpu(req_msg->offset40) &
+ 0xFFFFFFFE));
+}
+
+static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8);
+}
+
+static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg,
+ __be32 starting_psn)
+{
+ req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+ (be32_to_cpu(req_msg->offset44) & 0x000000FF));
+}
+
+static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+{
+ req_msg->offset44 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07));
+}
+
+static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->offset44) & 0x7);
+}
+
+static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg,
+ u8 retry_count)
+{
+ req_msg->offset44 = cpu_to_be32((retry_count & 0x7) |
+ (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8));
+}
+
+static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset50 >> 4;
+}
+
+static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu)
+{
+ req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4));
+}
+
+static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset50 & 0x7;
+}
+
+static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg,
+ u8 rnr_retry_count)
+{
+ req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) |
+ (rnr_retry_count & 0x7));
+}
+
+static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset51 >> 4;
+}
+
+static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg,
+ u8 retries)
+{
+ req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4));
+}
+
+static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg)
+{
+ return (req_msg->offset51 & 0x8) >> 3;
+}
+
+static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq)
+{
+ req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) |
+ ((srq & 0x1) << 3));
+}
+
+static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12);
+}
+
+static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+{
+ req_msg->primary_offset88 = cpu_to_be32(
+ (be32_to_cpu(req_msg->primary_offset88) &
+ 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F);
+}
+
+static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+{
+ req_msg->primary_offset88 = cpu_to_be32(
+ (be32_to_cpu(req_msg->primary_offset88) &
+ 0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->primary_offset94 >> 4);
+}
+
+static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+ req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) |
+ (sl << 4));
+}
+
+static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg)
+{
+ return (u8) ((req_msg->primary_offset94 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+{
+ req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) |
+ ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->primary_offset95 >> 3);
+}
+
+static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+{
+ req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) |
+ (local_ack_timeout << 3));
+}
+
+static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12);
+}
+
+static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+{
+ req_msg->alt_offset132 = cpu_to_be32(
+ (be32_to_cpu(req_msg->alt_offset132) &
+ 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F);
+}
+
+static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+{
+ req_msg->alt_offset132 = cpu_to_be32(
+ (be32_to_cpu(req_msg->alt_offset132) &
+ 0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->alt_offset138 >> 4);
+}
+
+static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+ req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) |
+ (sl << 4));
+}
+
+static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg)
+{
+ return (u8) ((req_msg->alt_offset138 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+{
+ req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) |
+ ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->alt_offset139 >> 3);
+}
+
+static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+{
+ req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) |
+ (local_ack_timeout << 3));
+}
+
+/* Message REJected or MRAed */
+enum cm_msg_response {
+ CM_MSG_RESPONSE_REQ = 0x0,
+ CM_MSG_RESPONSE_REP = 0x1,
+ CM_MSG_RESPONSE_OTHER = 0x2
+};
+
+ struct cm_mra_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message MRAed:2, rsvd:6 */
+ u8 offset8;
+ /* service timeout:5, rsvd:3 */
+ u8 offset9;
+
+ u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
+{
+ return (u8) (mra_msg->offset8 >> 6);
+}
+
+static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg)
+{
+ mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg)
+{
+ return (u8) (mra_msg->offset9 >> 3);
+}
+
+static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg,
+ u8 service_timeout)
+{
+ mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) |
+ (service_timeout << 3));
+}
+
+struct cm_rej_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message REJected:2, rsvd:6 */
+ u8 offset8;
+ /* reject info length:7, rsvd:1. */
+ u8 offset9;
+ __be16 reason;
+ u8 ari[IB_CM_REJ_ARI_LENGTH];
+
+ u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
+{
+ return (u8) (rej_msg->offset8 >> 6);
+}
+
+static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg)
+{
+ rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg)
+{
+ return (u8) (rej_msg->offset9 >> 1);
+}
+
+static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg,
+ u8 len)
+{
+ rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1));
+}
+
+struct cm_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ __be32 local_qkey;
+ /* local QPN:24, rsvd:8 */
+ __be32 offset12;
+ /* local EECN:24, rsvd:8 */
+ __be32 offset16;
+ /* starting PSN:24 rsvd:8 */
+ __be32 offset20;
+ u8 resp_resources;
+ u8 initiator_depth;
+ /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */
+ u8 offset26;
+ /* RNR retry count:3, SRQ:1, rsvd:5 */
+ u8 offset27;
+ __be64 local_ca_guid;
+
+ u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8);
+}
+
+static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
+{
+ rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(rep_msg->offset12) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8);
+}
+
+static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn)
+{
+ rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) |
+ (be32_to_cpu(rep_msg->offset16) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type)
+{
+ return (qp_type == IB_QPT_XRC_INI) ?
+ cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg);
+}
+
+static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8);
+}
+
+static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg,
+ __be32 starting_psn)
+{
+ rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+ (be32_to_cpu(rep_msg->offset20) & 0x000000FF));
+}
+
+static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset26 >> 3);
+}
+
+static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg,
+ u8 target_ack_delay)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) |
+ (target_ack_delay << 3));
+}
+
+static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg)
+{
+ return (u8) ((rep_msg->offset26 & 0x06) >> 1);
+}
+
+static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) |
+ ((failover & 0x3) << 1));
+}
+
+static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset26 & 0x01);
+}
+
+static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg,
+ u8 flow_ctrl)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) |
+ (flow_ctrl & 0x1));
+}
+
+static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset27 >> 5);
+}
+
+static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg,
+ u8 rnr_retry_count)
+{
+ rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) |
+ (rnr_retry_count << 5));
+}
+
+static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg)
+{
+ return (u8) ((rep_msg->offset27 >> 4) & 0x1);
+}
+
+static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq)
+{
+ rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) |
+ ((srq & 0x1) << 4));
+}
+
+struct cm_rtu_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_dreq_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* remote QPN/EECN:24, rsvd:8 */
+ __be32 offset8;
+
+ u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
+{
+ return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8);
+}
+
+static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn)
+{
+ dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(dreq_msg->offset8) & 0x000000FF));
+}
+
+struct cm_drep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_lap_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ __be32 rsvd8;
+ /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */
+ __be32 offset12;
+ __be32 rsvd16;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:4, traffic class:8 */
+ __be32 offset56;
+ u8 alt_hop_limit;
+ /* rsvd:2, packet rate:6 */
+ u8 offset61;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 offset62;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 offset63;
+
+ u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+{
+ return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8);
+}
+
+static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+{
+ lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(lap_msg->offset12) &
+ 0x000000FF));
+}
+
+static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+{
+ return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3);
+}
+
+static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+ u8 resp_timeout)
+{
+ lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(lap_msg->offset12) &
+ 0xFFFFFF07));
+}
+
+static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+{
+ return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12);
+}
+
+static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+ __be32 flow_label)
+{
+ lap_msg->offset56 = cpu_to_be32(
+ (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+{
+ return (u8) be32_to_cpu(lap_msg->offset56);
+}
+
+static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+ u8 traffic_class)
+{
+ lap_msg->offset56 = cpu_to_be32(traffic_class |
+ (be32_to_cpu(lap_msg->offset56) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset61 & 0x3F;
+}
+
+static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+ u8 packet_rate)
+{
+ lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0);
+}
+
+static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset62 >> 4;
+}
+
+static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+{
+ lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F);
+}
+
+static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+{
+ return (lap_msg->offset62 >> 3) & 0x1;
+}
+
+static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+ u8 subnet_local)
+{
+ lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+ (lap_msg->offset61 & 0xF7);
+}
+static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset63 >> 3;
+}
+
+static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+ u8 local_ack_timeout)
+{
+ lap_msg->offset63 = (local_ack_timeout << 3) |
+ (lap_msg->offset63 & 0x07);
+}
+
+struct cm_apr_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 info_length;
+ u8 ap_status;
+ __be16 rsvd;
+ u8 info[IB_CM_APR_INFO_LENGTH];
+
+ u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ __be16 pkey;
+ __be16 rsvd;
+ __be64 service_id;
+
+ u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
+} __attribute__ ((packed));
+
+struct cm_sidr_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ u8 status;
+ u8 info_length;
+ __be16 rsvd;
+ /* QPN:24, rsvd:8 */
+ __be32 offset8;
+ __be64 service_id;
+ __be32 qkey;
+ u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+ u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8);
+}
+
+static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+ __be32 qpn)
+{
+ sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(sidr_rep_msg->offset8) &
+ 0x000000FF));
+}
+
+#endif /* CM_MSGS_H */
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
new file mode 100644
index 000000000..842a30947
--- /dev/null
+++ b/drivers/infiniband/core/cma.c
@@ -0,0 +1,4702 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/igmp.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/route.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+#include "core_priv.h"
+#include "cma_priv.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define CMA_IBOE_PACKET_LIFETIME 18
+#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
+
+static const char * const cma_events[] = {
+ [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved",
+ [RDMA_CM_EVENT_ADDR_ERROR] = "address error",
+ [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ",
+ [RDMA_CM_EVENT_ROUTE_ERROR] = "route error",
+ [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request",
+ [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response",
+ [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error",
+ [RDMA_CM_EVENT_UNREACHABLE] = "unreachable",
+ [RDMA_CM_EVENT_REJECTED] = "rejected",
+ [RDMA_CM_EVENT_ESTABLISHED] = "established",
+ [RDMA_CM_EVENT_DISCONNECTED] = "disconnected",
+ [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal",
+ [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join",
+ [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error",
+ [RDMA_CM_EVENT_ADDR_CHANGE] = "address change",
+ [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit",
+};
+
+const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
+{
+ size_t index = event;
+
+ return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ?
+ cma_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(rdma_event_msg);
+
+const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id,
+ int reason)
+{
+ if (rdma_ib_or_roce(id->device, id->port_num))
+ return ibcm_reject_msg(reason);
+
+ if (rdma_protocol_iwarp(id->device, id->port_num))
+ return iwcm_reject_msg(reason);
+
+ WARN_ON_ONCE(1);
+ return "unrecognized transport";
+}
+EXPORT_SYMBOL(rdma_reject_msg);
+
+bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason)
+{
+ if (rdma_ib_or_roce(id->device, id->port_num))
+ return reason == IB_CM_REJ_CONSUMER_DEFINED;
+
+ if (rdma_protocol_iwarp(id->device, id->port_num))
+ return reason == -ECONNREFUSED;
+
+ WARN_ON_ONCE(1);
+ return false;
+}
+EXPORT_SYMBOL(rdma_is_consumer_reject);
+
+const void *rdma_consumer_reject_data(struct rdma_cm_id *id,
+ struct rdma_cm_event *ev, u8 *data_len)
+{
+ const void *p;
+
+ if (rdma_is_consumer_reject(id, ev->status)) {
+ *data_len = ev->param.conn.private_data_len;
+ p = ev->param.conn.private_data;
+ } else {
+ *data_len = 0;
+ p = NULL;
+ }
+ return p;
+}
+EXPORT_SYMBOL(rdma_consumer_reject_data);
+
+/**
+ * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id.
+ * @id: Communication Identifier
+ */
+struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id->device->node_type == RDMA_NODE_RNIC)
+ return id_priv->cm_id.iw;
+ return NULL;
+}
+EXPORT_SYMBOL(rdma_iw_cm_id);
+
+/**
+ * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
+ * @res: rdma resource tracking entry pointer
+ */
+struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
+{
+ struct rdma_id_private *id_priv =
+ container_of(res, struct rdma_id_private, res);
+
+ return &id_priv->id;
+}
+EXPORT_SYMBOL(rdma_res_to_id);
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client cma_client = {
+ .name = "cma",
+ .add = cma_add_one,
+ .remove = cma_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+static unsigned int cma_pernet_id;
+
+struct cma_pernet {
+ struct idr tcp_ps;
+ struct idr udp_ps;
+ struct idr ipoib_ps;
+ struct idr ib_ps;
+};
+
+static struct cma_pernet *cma_pernet(struct net *net)
+{
+ return net_generic(net, cma_pernet_id);
+}
+
+static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ switch (ps) {
+ case RDMA_PS_TCP:
+ return &pernet->tcp_ps;
+ case RDMA_PS_UDP:
+ return &pernet->udp_ps;
+ case RDMA_PS_IPOIB:
+ return &pernet->ipoib_ps;
+ case RDMA_PS_IB:
+ return &pernet->ib_ps;
+ default:
+ return NULL;
+ }
+}
+
+struct cma_device {
+ struct list_head list;
+ struct ib_device *device;
+ struct completion comp;
+ atomic_t refcount;
+ struct list_head id_list;
+ enum ib_gid_type *default_gid_type;
+ u8 *default_roce_tos;
+};
+
+struct rdma_bind_list {
+ enum rdma_ucm_port_space ps;
+ struct hlist_head owners;
+ unsigned short port;
+};
+
+struct class_port_info_context {
+ struct ib_class_port_info *class_port_info;
+ struct ib_device *device;
+ struct completion done;
+ struct ib_sa_query *sa_query;
+ u8 port_num;
+};
+
+static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,
+ struct rdma_bind_list *bind_list, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(struct net *net,
+ enum rdma_ucm_port_space ps, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps,
+ int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ idr_remove(idr, snum);
+}
+
+enum {
+ CMA_OPTION_AFONLY,
+};
+
+void cma_ref_dev(struct cma_device *cma_dev)
+{
+ atomic_inc(&cma_dev->refcount);
+}
+
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie)
+{
+ struct cma_device *cma_dev;
+ struct cma_device *found_cma_dev = NULL;
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(cma_dev, &dev_list, list)
+ if (filter(cma_dev->device, cookie)) {
+ found_cma_dev = cma_dev;
+ break;
+ }
+
+ if (found_cma_dev)
+ cma_ref_dev(found_cma_dev);
+ mutex_unlock(&lock);
+ return found_cma_dev;
+}
+
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port)
+{
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)];
+}
+
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port,
+ enum ib_gid_type default_gid_type)
+{
+ unsigned long supported_gids;
+
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ supported_gids = roce_gid_type_mask_support(cma_dev->device, port);
+
+ if (!(supported_gids & 1 << default_gid_type))
+ return -EINVAL;
+
+ cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] =
+ default_gid_type;
+
+ return 0;
+}
+
+int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port)
+{
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)];
+}
+
+int cma_set_default_roce_tos(struct cma_device *cma_dev, unsigned int port,
+ u8 default_roce_tos)
+{
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] =
+ default_roce_tos;
+
+ return 0;
+}
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev)
+{
+ return cma_dev->device;
+}
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+
+struct cma_multicast {
+ struct rdma_id_private *id_priv;
+ union {
+ struct ib_sa_multicast *ib;
+ } multicast;
+ struct list_head list;
+ void *context;
+ struct sockaddr_storage addr;
+ struct kref mcref;
+ u8 join_state;
+};
+
+struct cma_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ enum rdma_cm_state old_state;
+ enum rdma_cm_state new_state;
+ struct rdma_cm_event event;
+};
+
+struct cma_ndev_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct rdma_cm_event event;
+};
+
+struct iboe_mcast_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct cma_multicast *mc;
+};
+
+union cma_ip_addr {
+ struct in6_addr ip6;
+ struct {
+ __be32 pad[3];
+ __be32 addr;
+ } ip4;
+};
+
+struct cma_hdr {
+ u8 cma_version;
+ u8 ip_version; /* IP version: 7:4 */
+ __be16 port;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+};
+
+#define CMA_VERSION 0x00
+
+struct cma_req_info {
+ struct sockaddr_storage listen_addr_storage;
+ struct sockaddr_storage src_addr_storage;
+ struct ib_device *device;
+ union ib_gid local_gid;
+ __be64 service_id;
+ int port;
+ bool has_gid;
+ u16 pkey;
+};
+
+static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ ret = (id_priv->state == comp);
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state comp, enum rdma_cm_state exch)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if ((ret = (id_priv->state == comp)))
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state exch)
+{
+ unsigned long flags;
+ enum rdma_cm_state old;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ old = id_priv->state;
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return old;
+}
+
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
+{
+ return hdr->ip_version >> 4;
+}
+
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+ hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
+
+static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
+{
+ struct in_device *in_dev = NULL;
+
+ if (ndev) {
+ rtnl_lock();
+ in_dev = __in_dev_get_rtnl(ndev);
+ if (in_dev) {
+ if (join)
+ ip_mc_inc_group(in_dev,
+ *(__be32 *)(mgid->raw + 12));
+ else
+ ip_mc_dec_group(in_dev,
+ *(__be32 *)(mgid->raw + 12));
+ }
+ rtnl_unlock();
+ }
+ return (in_dev) ? 0 : -ENODEV;
+}
+
+static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ cma_ref_dev(cma_dev);
+ id_priv->cma_dev = cma_dev;
+ id_priv->id.device = cma_dev->device;
+ id_priv->id.route.addr.dev_addr.transport =
+ rdma_node_get_transport(cma_dev->device->node_type);
+ list_add_tail(&id_priv->list, &cma_dev->id_list);
+ rdma_restrack_add(&id_priv->res);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ _cma_attach_to_dev(id_priv, cma_dev);
+ id_priv->gid_type =
+ cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(cma_dev->device)];
+}
+
+void cma_deref_dev(struct cma_device *cma_dev)
+{
+ if (atomic_dec_and_test(&cma_dev->refcount))
+ complete(&cma_dev->comp);
+}
+
+static inline void release_mc(struct kref *kref)
+{
+ struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref);
+
+ kfree(mc->multicast.ib);
+ kfree(mc);
+}
+
+static void cma_release_dev(struct rdma_id_private *id_priv)
+{
+ mutex_lock(&lock);
+ list_del(&id_priv->list);
+ cma_deref_dev(id_priv->cma_dev);
+ id_priv->cma_dev = NULL;
+ mutex_unlock(&lock);
+}
+
+static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
+}
+
+static inline unsigned short cma_family(struct rdma_id_private *id_priv)
+{
+ return id_priv->id.route.addr.src_addr.ss_family;
+}
+
+static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
+{
+ struct ib_sa_mcmember_rec rec;
+ int ret = 0;
+
+ if (id_priv->qkey) {
+ if (qkey && id_priv->qkey != qkey)
+ return -EINVAL;
+ return 0;
+ }
+
+ if (qkey) {
+ id_priv->qkey = qkey;
+ return 0;
+ }
+
+ switch (id_priv->id.ps) {
+ case RDMA_PS_UDP:
+ case RDMA_PS_IB:
+ id_priv->qkey = RDMA_UDP_QKEY;
+ break;
+ case RDMA_PS_IPOIB:
+ ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+ id_priv->id.port_num, &rec.mgid,
+ &rec);
+ if (!ret)
+ id_priv->qkey = be32_to_cpu(rec.qkey);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
+{
+ dev_addr->dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr);
+ ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey));
+}
+
+static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+ int ret;
+
+ if (addr->sa_family != AF_IB) {
+ ret = rdma_translate_ip(addr, dev_addr);
+ } else {
+ cma_translate_ib((struct sockaddr_ib *) addr, dev_addr);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static const struct ib_gid_attr *
+cma_validate_port(struct ib_device *device, u8 port,
+ enum ib_gid_type gid_type,
+ union ib_gid *gid,
+ struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int bound_if_index = dev_addr->bound_dev_if;
+ const struct ib_gid_attr *sgid_attr;
+ int dev_type = dev_addr->dev_type;
+ struct net_device *ndev = NULL;
+
+ if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
+ return ERR_PTR(-ENODEV);
+
+ if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
+ return ERR_PTR(-ENODEV);
+
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ return ERR_PTR(-ENODEV);
+ } else {
+ gid_type = IB_GID_TYPE_IB;
+ }
+
+ sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev);
+ if (ndev)
+ dev_put(ndev);
+ return sgid_attr;
+}
+
+static void cma_bind_sgid_attr(struct rdma_id_private *id_priv,
+ const struct ib_gid_attr *sgid_attr)
+{
+ WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr);
+ id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr;
+}
+
+static int cma_acquire_dev(struct rdma_id_private *id_priv,
+ const struct rdma_id_private *listen_id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ const struct ib_gid_attr *sgid_attr;
+ struct cma_device *cma_dev;
+ union ib_gid gid, iboe_gid, *gidp;
+ enum ib_gid_type gid_type;
+ int ret = -ENODEV;
+ u8 port;
+
+ if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+ id_priv->id.ps == RDMA_PS_IPOIB)
+ return -EINVAL;
+
+ mutex_lock(&lock);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &iboe_gid);
+
+ memcpy(&gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof gid);
+
+ if (listen_id_priv) {
+ cma_dev = listen_id_priv->cma_dev;
+ port = listen_id_priv->id.port_num;
+ gidp = rdma_protocol_roce(cma_dev->device, port) ?
+ &iboe_gid : &gid;
+ gid_type = listen_id_priv->gid_type;
+ sgid_attr = cma_validate_port(cma_dev->device, port,
+ gid_type, gidp, id_priv);
+ if (!IS_ERR(sgid_attr)) {
+ id_priv->id.port_num = port;
+ cma_bind_sgid_attr(id_priv, sgid_attr);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+ if (listen_id_priv &&
+ listen_id_priv->cma_dev == cma_dev &&
+ listen_id_priv->id.port_num == port)
+ continue;
+
+ gidp = rdma_protocol_roce(cma_dev->device, port) ?
+ &iboe_gid : &gid;
+ gid_type = cma_dev->default_gid_type[port - 1];
+ sgid_attr = cma_validate_port(cma_dev->device, port,
+ gid_type, gidp, id_priv);
+ if (!IS_ERR(sgid_attr)) {
+ id_priv->id.port_num = port;
+ cma_bind_sgid_attr(id_priv, sgid_attr);
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (!ret)
+ cma_attach_to_dev(id_priv, cma_dev);
+
+ mutex_unlock(&lock);
+ return ret;
+}
+
+/*
+ * Select the source IB device and address to reach the destination IB address.
+ */
+static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev, *cur_dev;
+ struct sockaddr_ib *addr;
+ union ib_gid gid, sgid, *dgid;
+ u16 pkey, index;
+ u8 p;
+ enum ib_port_state port_state;
+ int i;
+
+ cma_dev = NULL;
+ addr = (struct sockaddr_ib *) cma_dst_addr(id_priv);
+ dgid = (union ib_gid *) &addr->sib_addr;
+ pkey = ntohs(addr->sib_pkey);
+
+ mutex_lock(&lock);
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ if (!rdma_cap_af_ib(cur_dev->device, p))
+ continue;
+
+ if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
+ continue;
+
+ if (ib_get_cached_port_state(cur_dev->device, p, &port_state))
+ continue;
+ for (i = 0; !rdma_query_gid(cur_dev->device,
+ p, i, &gid);
+ i++) {
+ if (!memcmp(&gid, dgid, sizeof(gid))) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ goto found;
+ }
+
+ if (!cma_dev && (gid.global.subnet_prefix ==
+ dgid->global.subnet_prefix) &&
+ port_state == IB_PORT_ACTIVE) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ goto found;
+ }
+ }
+ }
+ }
+ mutex_unlock(&lock);
+ return -ENODEV;
+
+found:
+ cma_attach_to_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
+ addr = (struct sockaddr_ib *)cma_src_addr(id_priv);
+ memcpy(&addr->sib_addr, &sgid, sizeof(sgid));
+ cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
+ return 0;
+}
+
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+ if (atomic_dec_and_test(&id_priv->refcount))
+ complete(&id_priv->comp);
+}
+
+struct rdma_cm_id *__rdma_create_id(struct net *net,
+ rdma_cm_event_handler event_handler,
+ void *context, enum rdma_ucm_port_space ps,
+ enum ib_qp_type qp_type, const char *caller)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL);
+ if (!id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ if (caller)
+ id_priv->res.kern_name = caller;
+ else
+ rdma_restrack_set_task(&id_priv->res, current);
+ id_priv->res.type = RDMA_RESTRACK_CM_ID;
+ id_priv->state = RDMA_CM_IDLE;
+ id_priv->id.context = context;
+ id_priv->id.event_handler = event_handler;
+ id_priv->id.ps = ps;
+ id_priv->id.qp_type = qp_type;
+ id_priv->tos_set = false;
+ id_priv->gid_type = IB_GID_TYPE_IB;
+ spin_lock_init(&id_priv->lock);
+ mutex_init(&id_priv->qp_mutex);
+ init_completion(&id_priv->comp);
+ atomic_set(&id_priv->refcount, 1);
+ mutex_init(&id_priv->handler_mutex);
+ INIT_LIST_HEAD(&id_priv->listen_list);
+ INIT_LIST_HEAD(&id_priv->mc_list);
+ get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+ id_priv->id.route.addr.dev_addr.net = get_net(net);
+ id_priv->seq_num &= 0x00ffffff;
+
+ return &id_priv->id;
+}
+EXPORT_SYMBOL(__rdma_create_id);
+
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+ return ret;
+}
+
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct rdma_id_private *id_priv;
+ struct ib_qp *qp;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id->device != pd->device)
+ return -EINVAL;
+
+ qp_init_attr->port_num = id->port_num;
+ qp = ib_create_qp(pd, qp_init_attr);
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
+
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_init_ud_qp(id_priv, qp);
+ else
+ ret = cma_init_conn_qp(id_priv, qp);
+ if (ret)
+ goto err;
+
+ id->qp = qp;
+ id_priv->qp_num = qp->qp_num;
+ id_priv->srq = (qp->srq != NULL);
+ return 0;
+err:
+ ib_destroy_qp(qp);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mutex_lock(&id_priv->qp_mutex);
+ ib_destroy_qp(id_priv->id.qp);
+ id_priv->id.qp = NULL;
+ mutex_unlock(&id_priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Need to update QP attributes from default values. */
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ BUG_ON(id_priv->cma_dev->device != id_priv->id.device);
+
+ if (conn_param)
+ qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ if (conn_param)
+ qp_attr.max_rd_atomic = conn_param->initiator_depth;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int ret;
+ u16 pkey;
+
+ if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num))
+ pkey = 0xffff;
+ else
+ pkey = ib_addr_get_pkey(dev_addr);
+
+ ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+ pkey, &qp_attr->pkey_index);
+ if (ret)
+ return ret;
+
+ qp_attr->port_num = id_priv->id.port_num;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ if (id_priv->id.qp_type == IB_QPT_UD) {
+ ret = cma_set_qkey(id_priv, 0);
+ if (ret)
+ return ret;
+
+ qp_attr->qkey = id_priv->qkey;
+ *qp_attr_mask |= IB_QP_QKEY;
+ } else {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+ }
+ return 0;
+}
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct rdma_id_private *id_priv;
+ int ret = 0;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
+ ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
+ else
+ ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+ qp_attr_mask);
+
+ if (qp_attr->qp_state == IB_QPS_RTR)
+ qp_attr->rq_psn = id_priv->seq_num;
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ if (!id_priv->cm_id.iw) {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ } else
+ ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
+ qp_attr_mask);
+ qp_attr->port_num = id_priv->id.port_num;
+ *qp_attr_mask |= IB_QP_PORT;
+ } else
+ ret = -ENOSYS;
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+static inline bool cma_zero_addr(const struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr);
+ default:
+ return false;
+ }
+}
+
+static inline bool cma_loopback_addr(const struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_loopback(
+ ((struct sockaddr_in *)addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_loopback(
+ &((struct sockaddr_in6 *)addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_loopback(
+ &((struct sockaddr_ib *)addr)->sib_addr);
+ default:
+ return false;
+ }
+}
+
+static inline bool cma_any_addr(const struct sockaddr *addr)
+{
+ return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst)
+{
+ if (src->sa_family != dst->sa_family)
+ return -1;
+
+ switch (src->sa_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *)src)->sin_addr.s_addr !=
+ ((struct sockaddr_in *)dst)->sin_addr.s_addr;
+ case AF_INET6: {
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst;
+ bool link_local;
+
+ if (ipv6_addr_cmp(&src_addr6->sin6_addr,
+ &dst_addr6->sin6_addr))
+ return 1;
+ link_local = ipv6_addr_type(&dst_addr6->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ /* Link local must match their scope_ids */
+ return link_local ? (src_addr6->sin6_scope_id !=
+ dst_addr6->sin6_scope_id) :
+ 0;
+ }
+
+ default:
+ return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
+ &((struct sockaddr_ib *) dst)->sib_addr);
+ }
+}
+
+static __be16 cma_port(const struct sockaddr *addr)
+{
+ struct sockaddr_ib *sib;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *) addr)->sin_port;
+ case AF_INET6:
+ return ((struct sockaddr_in6 *) addr)->sin6_port;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ return htons((u16) (be64_to_cpu(sib->sib_sid) &
+ be64_to_cpu(sib->sib_sid_mask)));
+ default:
+ return 0;
+ }
+}
+
+static inline int cma_any_port(const struct sockaddr *addr)
+{
+ return !cma_port(addr);
+}
+
+static void cma_save_ib_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ const struct rdma_cm_id *listen_id,
+ const struct sa_path_rec *path)
+{
+ struct sockaddr_ib *listen_ib, *ib;
+
+ listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
+ if (src_addr) {
+ ib = (struct sockaddr_ib *)src_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ ib->sib_sid = path->service_id;
+ ib->sib_scope_id = 0;
+ } else {
+ ib->sib_pkey = listen_ib->sib_pkey;
+ ib->sib_flowinfo = listen_ib->sib_flowinfo;
+ ib->sib_addr = listen_ib->sib_addr;
+ ib->sib_sid = listen_ib->sib_sid;
+ ib->sib_scope_id = listen_ib->sib_scope_id;
+ }
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ }
+ if (dst_addr) {
+ ib = (struct sockaddr_ib *)dst_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
+ }
+}
+
+static void cma_save_ip4_info(struct sockaddr_in *src_addr,
+ struct sockaddr_in *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
+{
+ if (src_addr) {
+ *src_addr = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = hdr->dst_addr.ip4.addr,
+ .sin_port = local_port,
+ };
+ }
+
+ if (dst_addr) {
+ *dst_addr = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = hdr->src_addr.ip4.addr,
+ .sin_port = hdr->port,
+ };
+ }
+}
+
+static void cma_save_ip6_info(struct sockaddr_in6 *src_addr,
+ struct sockaddr_in6 *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
+{
+ if (src_addr) {
+ *src_addr = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = hdr->dst_addr.ip6,
+ .sin6_port = local_port,
+ };
+ }
+
+ if (dst_addr) {
+ *dst_addr = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = hdr->src_addr.ip6,
+ .sin6_port = hdr->port,
+ };
+ }
+}
+
+static u16 cma_port_from_service_id(__be64 service_id)
+{
+ return (u16)be64_to_cpu(service_id);
+}
+
+static int cma_save_ip_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ const struct ib_cm_event *ib_event,
+ __be64 service_id)
+{
+ struct cma_hdr *hdr;
+ __be16 port;
+
+ hdr = ib_event->private_data;
+ if (hdr->cma_version != CMA_VERSION)
+ return -EINVAL;
+
+ port = htons(cma_port_from_service_id(service_id));
+
+ switch (cma_get_ip_ver(hdr)) {
+ case 4:
+ cma_save_ip4_info((struct sockaddr_in *)src_addr,
+ (struct sockaddr_in *)dst_addr, hdr, port);
+ break;
+ case 6:
+ cma_save_ip6_info((struct sockaddr_in6 *)src_addr,
+ (struct sockaddr_in6 *)dst_addr, hdr, port);
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+static int cma_save_net_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ const struct rdma_cm_id *listen_id,
+ const struct ib_cm_event *ib_event,
+ sa_family_t sa_family, __be64 service_id)
+{
+ if (sa_family == AF_IB) {
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id,
+ ib_event->param.req_rcvd.primary_path);
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+ return 0;
+ }
+
+ return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ const struct ib_cm_req_event_param *req_param =
+ &ib_event->param.req_rcvd;
+ const struct ib_cm_sidr_req_event_param *sidr_param =
+ &ib_event->param.sidr_req_rcvd;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_RECEIVED:
+ req->device = req_param->listen_id->device;
+ req->port = req_param->port;
+ memcpy(&req->local_gid, &req_param->primary_path->sgid,
+ sizeof(req->local_gid));
+ req->has_gid = true;
+ req->service_id = req_param->primary_path->service_id;
+ req->pkey = be16_to_cpu(req_param->primary_path->pkey);
+ if (req->pkey != req_param->bth_pkey)
+ pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n"
+ "RDMA CMA: in the future this may cause the request to be dropped\n",
+ req_param->bth_pkey, req->pkey);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ req->device = sidr_param->listen_id->device;
+ req->port = sidr_param->port;
+ req->has_gid = false;
+ req->service_id = sidr_param->service_id;
+ req->pkey = sidr_param->pkey;
+ if (req->pkey != sidr_param->bth_pkey)
+ pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n"
+ "RDMA CMA: in the future this may cause the request to be dropped\n",
+ sidr_param->bth_pkey, req->pkey);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in *dst_addr,
+ const struct sockaddr_in *src_addr)
+{
+ __be32 daddr = dst_addr->sin_addr.s_addr,
+ saddr = src_addr->sin_addr.s_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ int err;
+ bool ret;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+ ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+ ipv4_is_loopback(saddr))
+ return false;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_iif = net_dev->ifindex;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+
+ rcu_read_lock();
+ err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+ ret = err == 0 && FIB_RES_DEV(res) == net_dev;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in6 *dst_addr,
+ const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+ &src_addr->sin6_addr, net_dev->ifindex,
+ NULL, strict);
+ bool ret;
+
+ if (!rt)
+ return false;
+
+ ret = rt->rt6i_idev->dev == net_dev;
+ ip6_rt_put(rt);
+
+ return ret;
+#else
+ return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+ const struct sockaddr *daddr,
+ const struct sockaddr *saddr)
+{
+ const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+ const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+ const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+ const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+ switch (daddr->sa_family) {
+ case AF_INET:
+ return saddr->sa_family == AF_INET &&
+ validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+ case AF_INET6:
+ return saddr->sa_family == AF_INET6 &&
+ validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+ default:
+ return false;
+ }
+}
+
+static struct net_device *
+roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
+{
+ const struct ib_gid_attr *sgid_attr = NULL;
+
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr;
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr;
+
+ if (!sgid_attr)
+ return NULL;
+ dev_hold(sgid_attr->ndev);
+ return sgid_attr->ndev;
+}
+
+static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ struct sockaddr *listen_addr =
+ (struct sockaddr *)&req->listen_addr_storage;
+ struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage;
+ struct net_device *net_dev;
+ const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+ int err;
+
+ err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+ req->service_id);
+ if (err)
+ return ERR_PTR(err);
+
+ if (rdma_protocol_roce(req->device, req->port))
+ net_dev = roce_get_net_dev_by_cm_event(ib_event);
+ else
+ net_dev = ib_get_net_dev_by_params(req->device, req->port,
+ req->pkey,
+ gid, listen_addr);
+ if (!net_dev)
+ return ERR_PTR(-ENODEV);
+
+ return net_dev;
+}
+
+static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+ return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+ const struct cma_hdr *hdr)
+{
+ struct sockaddr *addr = cma_src_addr(id_priv);
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ if (cma_any_addr(addr) && !id_priv->afonly)
+ return true;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+ if (cma_get_ip_ver(hdr) != 4)
+ return false;
+ if (!cma_any_addr(addr) &&
+ hdr->dst_addr.ip4.addr != ip4_addr)
+ return false;
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+ if (cma_get_ip_ver(hdr) != 6)
+ return false;
+ if (!cma_any_addr(addr) &&
+ memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+ return false;
+ break;
+ case AF_IB:
+ return true;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool cma_protocol_roce(const struct rdma_cm_id *id)
+{
+ struct ib_device *device = id->device;
+ const int port_num = id->port_num ?: rdma_start_port(device);
+
+ return rdma_protocol_roce(device, port_num);
+}
+
+static bool cma_match_net_dev(const struct rdma_cm_id *id,
+ const struct net_device *net_dev,
+ u8 port_num)
+{
+ const struct rdma_addr *addr = &id->route.addr;
+
+ if (!net_dev)
+ /* This request is an AF_IB request */
+ return (!id->port_num || id->port_num == port_num) &&
+ (addr->src_addr.ss_family == AF_IB);
+
+ /*
+ * Net namespaces must match, and if the listner is listening
+ * on a specific netdevice than netdevice must match as well.
+ */
+ if (net_eq(dev_net(net_dev), addr->dev_addr.net) &&
+ (!!addr->dev_addr.bound_dev_if ==
+ (addr->dev_addr.bound_dev_if == net_dev->ifindex)))
+ return true;
+ else
+ return false;
+}
+
+static struct rdma_id_private *cma_find_listener(
+ const struct rdma_bind_list *bind_list,
+ const struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ const struct cma_req_info *req,
+ const struct net_device *net_dev)
+{
+ struct rdma_id_private *id_priv, *id_priv_dev;
+
+ lockdep_assert_held(&lock);
+
+ if (!bind_list)
+ return ERR_PTR(-EINVAL);
+
+ hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+ if (cma_match_private_data(id_priv, ib_event->private_data)) {
+ if (id_priv->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv->id, net_dev, req->port))
+ return id_priv;
+ list_for_each_entry(id_priv_dev,
+ &id_priv->listen_list,
+ listen_list) {
+ if (id_priv_dev->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv_dev->id, net_dev, req->port))
+ return id_priv_dev;
+ }
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *
+cma_ib_id_from_event(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ struct net_device **net_dev)
+{
+ struct cma_req_info req;
+ struct rdma_bind_list *bind_list;
+ struct rdma_id_private *id_priv;
+ int err;
+
+ err = cma_save_req_info(ib_event, &req);
+ if (err)
+ return ERR_PTR(err);
+
+ *net_dev = cma_get_net_dev(ib_event, &req);
+ if (IS_ERR(*net_dev)) {
+ if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+ /* Assuming the protocol is AF_IB */
+ *net_dev = NULL;
+ } else {
+ return ERR_CAST(*net_dev);
+ }
+ }
+
+ mutex_lock(&lock);
+ /*
+ * Net namespace might be getting deleted while route lookup,
+ * cm_id lookup is in progress. Therefore, perform netdevice
+ * validation, cm_id lookup under rcu lock.
+ * RCU lock along with netdevice state check, synchronizes with
+ * netdevice migrating to different net namespace and also avoids
+ * case where net namespace doesn't get deleted while lookup is in
+ * progress.
+ * If the device state is not IFF_UP, its properties such as ifindex
+ * and nd_net cannot be trusted to remain valid without rcu lock.
+ * net/core/dev.c change_net_namespace() ensures to synchronize with
+ * ongoing operations on net device after device is closed using
+ * synchronize_net().
+ */
+ rcu_read_lock();
+ if (*net_dev) {
+ /*
+ * If netdevice is down, it is likely that it is administratively
+ * down or it might be migrating to different namespace.
+ * In that case avoid further processing, as the net namespace
+ * or ifindex may change.
+ */
+ if (((*net_dev)->flags & IFF_UP) == 0) {
+ id_priv = ERR_PTR(-EHOSTUNREACH);
+ goto err;
+ }
+
+ if (!validate_net_dev(*net_dev,
+ (struct sockaddr *)&req.listen_addr_storage,
+ (struct sockaddr *)&req.src_addr_storage)) {
+ id_priv = ERR_PTR(-EHOSTUNREACH);
+ goto err;
+ }
+ }
+
+ bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net,
+ rdma_ps_from_service_id(req.service_id),
+ cma_port_from_service_id(req.service_id));
+ id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+err:
+ rcu_read_unlock();
+ mutex_unlock(&lock);
+ if (IS_ERR(id_priv) && *net_dev) {
+ dev_put(*net_dev);
+ *net_dev = NULL;
+ }
+ return id_priv;
+}
+
+static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv)
+{
+ return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
+}
+
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+ if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) {
+ if (id_priv->query)
+ ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+ }
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *dev_id_priv;
+
+ /*
+ * Remove from listen_any_list to prevent added devices from spawning
+ * additional listen requests.
+ */
+ mutex_lock(&lock);
+ list_del(&id_priv->list);
+
+ while (!list_empty(&id_priv->listen_list)) {
+ dev_id_priv = list_entry(id_priv->listen_list.next,
+ struct rdma_id_private, listen_list);
+ /* sync with device removal to avoid duplicate destruction */
+ list_del_init(&dev_id_priv->list);
+ list_del(&dev_id_priv->listen_list);
+ mutex_unlock(&lock);
+
+ rdma_destroy_id(&dev_id_priv->id);
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+ enum rdma_cm_state state)
+{
+ switch (state) {
+ case RDMA_CM_ADDR_QUERY:
+ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+ break;
+ case RDMA_CM_ROUTE_QUERY:
+ cma_cancel_route(id_priv);
+ break;
+ case RDMA_CM_LISTEN:
+ if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
+ cma_cancel_listens(id_priv);
+ break;
+ default:
+ break;
+ }
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ if (!bind_list)
+ return;
+
+ mutex_lock(&lock);
+ hlist_del(&id_priv->node);
+ if (hlist_empty(&bind_list->owners)) {
+ cma_ps_remove(net, bind_list->ps, bind_list->port);
+ kfree(bind_list);
+ }
+ mutex_unlock(&lock);
+}
+
+static void destroy_mc(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) {
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ return;
+ }
+
+ if (rdma_protocol_roce(id_priv->id.device,
+ id_priv->id.port_num)) {
+ struct rdma_dev_addr *dev_addr =
+ &id_priv->id.route.addr.dev_addr;
+ struct net_device *ndev = NULL;
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(dev_addr->net,
+ dev_addr->bound_dev_if);
+ if (ndev) {
+ cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, false);
+ dev_put(ndev);
+ }
+ kref_put(&mc->mcref, release_mc);
+ }
+}
+
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+ struct cma_multicast *mc;
+
+ while (!list_empty(&id_priv->mc_list)) {
+ mc = list_first_entry(&id_priv->mc_list, struct cma_multicast,
+ list);
+ list_del(&mc->list);
+ destroy_mc(id_priv, mc);
+ }
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ enum rdma_cm_state state;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ state = cma_exch(id_priv, RDMA_CM_DESTROYING);
+ cma_cancel_operation(id_priv, state);
+
+ /*
+ * Wait for any active callback to finish. New callbacks will find
+ * the id_priv state set to destroying and abort.
+ */
+ mutex_lock(&id_priv->handler_mutex);
+ mutex_unlock(&id_priv->handler_mutex);
+
+ rdma_restrack_del(&id_priv->res);
+ if (id_priv->cma_dev) {
+ if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
+ if (id_priv->cm_id.ib)
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) {
+ if (id_priv->cm_id.iw)
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ }
+ cma_leave_mc_groups(id_priv);
+ cma_release_dev(id_priv);
+ }
+
+ cma_release_port(id_priv);
+ cma_deref_id(id_priv);
+ wait_for_completion(&id_priv->comp);
+
+ if (id_priv->internal_id)
+ cma_deref_id(id_priv->id.context);
+
+ kfree(id_priv->id.route.path_rec);
+
+ if (id_priv->id.route.addr.dev_addr.sgid_attr)
+ rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr);
+
+ put_net(id_priv->id.route.addr.dev_addr.net);
+ kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ ret = cma_modify_qp_rts(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret);
+ cma_modify_qp_err(id_priv);
+ ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ return ret;
+}
+
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+ const struct ib_cm_rep_event_param *rep_data,
+ void *private_data)
+{
+ event->param.conn.private_data = private_data;
+ event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ event->param.conn.responder_resources = rep_data->responder_resources;
+ event->param.conn.initiator_depth = rep_data->initiator_depth;
+ event->param.conn.flow_control = rep_data->flow_control;
+ event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
+ event->param.conn.srq = rep_data->srq;
+ event->param.conn.qp_num = rep_data->remote_qpn;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event = {};
+ int ret = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+ id_priv->state != RDMA_CM_CONNECT) ||
+ (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+ id_priv->state != RDMA_CM_DISCONNECT))
+ goto out;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REP_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_REP_RECEIVED:
+ if (cma_comp(id_priv, RDMA_CM_CONNECT) &&
+ (id_priv->id.qp_type != IB_QPT_UD))
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ if (id_priv->id.qp) {
+ event.status = cma_rep_recv(id_priv);
+ event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+ RDMA_CM_EVENT_ESTABLISHED;
+ } else {
+ event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+ }
+ cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+ ib_event->private_data);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ case IB_CM_USER_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case IB_CM_DREQ_ERROR:
+ event.status = -ETIMEDOUT; /* fall through */
+ case IB_CM_DREQ_RECEIVED:
+ case IB_CM_DREP_RECEIVED:
+ if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
+ RDMA_CM_DISCONNECT))
+ goto out;
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ /* ignore event */
+ goto out;
+ case IB_CM_REJ_RECEIVED:
+ pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id,
+ ib_event->param.rej_rcvd.reason));
+ cma_modify_qp_err(id_priv);
+ event.status = ib_event->param.rej_rcvd.reason;
+ event.event = RDMA_CM_EVENT_REJECTED;
+ event.param.conn.private_data = ib_event->private_data;
+ event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ break;
+ default:
+ pr_err("RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static struct rdma_id_private *
+cma_ib_new_conn_id(const struct rdma_cm_id *listen_id,
+ const struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
+{
+ struct rdma_id_private *listen_id_priv;
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ struct rdma_route *rt;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path;
+ const __be64 service_id =
+ ib_event->param.req_rcvd.primary_path->service_id;
+ int ret;
+
+ listen_id_priv = container_of(listen_id, struct rdma_id_private, id);
+ id = __rdma_create_id(listen_id->route.addr.dev_addr.net,
+ listen_id->event_handler, listen_id->context,
+ listen_id->ps, ib_event->param.req_rcvd.qp_type,
+ listen_id_priv->res.kern_name);
+ if (IS_ERR(id))
+ return NULL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family, service_id))
+ goto err;
+
+ rt = &id->route;
+ rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+ rt->path_rec = kmalloc_array(rt->num_paths, sizeof(*rt->path_rec),
+ GFP_KERNEL);
+ if (!rt->path_rec)
+ goto err;
+
+ rt->path_rec[0] = *path;
+ if (rt->num_paths == 2)
+ rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+ if (net_dev) {
+ rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
+ } else {
+ if (!cma_protocol_roce(listen_id) &&
+ cma_any_addr(cma_src_addr(id_priv))) {
+ rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+ ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+ } else if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (ret)
+ goto err;
+ }
+ }
+ rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+ id_priv->state = RDMA_CM_CONNECT;
+ return id_priv;
+
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static struct rdma_id_private *
+cma_ib_new_udp_id(const struct rdma_cm_id *listen_id,
+ const struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
+{
+ const struct rdma_id_private *listen_id_priv;
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ struct net *net = listen_id->route.addr.dev_addr.net;
+ int ret;
+
+ listen_id_priv = container_of(listen_id, struct rdma_id_private, id);
+ id = __rdma_create_id(net, listen_id->event_handler, listen_id->context,
+ listen_id->ps, IB_QPT_UD,
+ listen_id_priv->res.kern_name);
+ if (IS_ERR(id))
+ return NULL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family,
+ ib_event->param.sidr_req_rcvd.service_id))
+ goto err;
+
+ if (net_dev) {
+ rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
+ } else {
+ if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv),
+ &id->route.addr.dev_addr);
+ if (ret)
+ goto err;
+ }
+ }
+
+ id_priv->state = RDMA_CM_CONNECT;
+ return id_priv;
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+ const struct ib_cm_req_event_param *req_data,
+ void *private_data, int offset)
+{
+ event->param.conn.private_data = private_data + offset;
+ event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+ event->param.conn.responder_resources = req_data->responder_resources;
+ event->param.conn.initiator_depth = req_data->initiator_depth;
+ event->param.conn.flow_control = req_data->flow_control;
+ event->param.conn.retry_count = req_data->retry_count;
+ event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
+ event->param.conn.srq = req_data->srq;
+ event->param.conn.qp_num = req_data->remote_qpn;
+}
+
+static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id,
+ const struct ib_cm_event *ib_event)
+{
+ return (((ib_event->event == IB_CM_REQ_RECEIVED) &&
+ (ib_event->param.req_rcvd.qp_type == id->qp_type)) ||
+ ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) &&
+ (id->qp_type == IB_QPT_UD)) ||
+ (!id->qp_type));
+}
+
+static int cma_ib_req_handler(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *listen_id, *conn_id = NULL;
+ struct rdma_cm_event event = {};
+ struct net_device *net_dev;
+ u8 offset;
+ int ret;
+
+ listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev);
+ if (IS_ERR(listen_id))
+ return PTR_ERR(listen_id);
+
+ if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) {
+ ret = -EINVAL;
+ goto net_dev_put;
+ }
+
+ mutex_lock(&listen_id->handler_mutex);
+ if (listen_id->state != RDMA_CM_LISTEN) {
+ ret = -ECONNABORTED;
+ goto err1;
+ }
+
+ offset = cma_user_data_offset(listen_id);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
+ conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev);
+ event.param.ud.private_data = ib_event->private_data + offset;
+ event.param.ud.private_data_len =
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+ } else {
+ conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev);
+ cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+ ib_event->private_data, offset);
+ }
+ if (!conn_id) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ ret = cma_acquire_dev(conn_id, listen_id);
+ if (ret)
+ goto err2;
+
+ conn_id->cm_id.ib = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_ib_handler;
+
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret)
+ goto err3;
+ /*
+ * Acquire mutex to prevent user executing rdma_destroy_id()
+ * while we're accessing the cm_id.
+ */
+ mutex_lock(&lock);
+ if (cma_comp(conn_id, RDMA_CM_CONNECT) &&
+ (conn_id->id.qp_type != IB_QPT_UD))
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ mutex_unlock(&lock);
+ mutex_unlock(&conn_id->handler_mutex);
+ mutex_unlock(&listen_id->handler_mutex);
+ cma_deref_id(conn_id);
+ if (net_dev)
+ dev_put(net_dev);
+ return 0;
+
+err3:
+ cma_deref_id(conn_id);
+ /* Destroy the CM ID by returning a non-zero value. */
+ conn_id->cm_id.ib = NULL;
+err2:
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+err1:
+ mutex_unlock(&listen_id->handler_mutex);
+ if (conn_id)
+ rdma_destroy_id(&conn_id->id);
+
+net_dev_put:
+ if (net_dev)
+ dev_put(net_dev);
+
+ return ret;
+}
+
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_IB)
+ return ((struct sockaddr_ib *) addr)->sib_sid;
+
+ return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr)));
+}
+EXPORT_SYMBOL(rdma_get_service_id);
+
+void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid,
+ union ib_gid *dgid)
+{
+ struct rdma_addr *addr = &cm_id->route.addr;
+
+ if (!cm_id->device) {
+ if (sgid)
+ memset(sgid, 0, sizeof(*sgid));
+ if (dgid)
+ memset(dgid, 0, sizeof(*dgid));
+ return;
+ }
+
+ if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) {
+ if (sgid)
+ rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid);
+ if (dgid)
+ rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid);
+ } else {
+ if (sgid)
+ rdma_addr_get_sgid(&addr->dev_addr, sgid);
+ if (dgid)
+ rdma_addr_get_dgid(&addr->dev_addr, dgid);
+ }
+}
+EXPORT_SYMBOL(rdma_read_gids);
+
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+ struct rdma_id_private *id_priv = iw_id->context;
+ struct rdma_cm_event event = {};
+ int ret = 0;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_CONNECT)
+ goto out;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CLOSE:
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ memcpy(cma_src_addr(id_priv), laddr,
+ rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(id_priv), raddr,
+ rdma_addr_size(raddr));
+ switch (iw_event->status) {
+ case 0:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ case -ECONNRESET:
+ case -ECONNREFUSED:
+ event.event = RDMA_CM_EVENT_REJECTED;
+ break;
+ case -ETIMEDOUT:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ break;
+ default:
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ break;
+ }
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ default:
+ goto out;
+ }
+
+ event.status = iw_event->status;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.iw = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct rdma_cm_id *new_cm_id;
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event = {};
+ int ret = -ECONNABORTED;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+
+ listen_id = cm_id->context;
+
+ mutex_lock(&listen_id->handler_mutex);
+ if (listen_id->state != RDMA_CM_LISTEN)
+ goto out;
+
+ /* Create a new RDMA id for the new IW CM ID */
+ new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net,
+ listen_id->id.event_handler,
+ listen_id->id.context,
+ RDMA_PS_TCP, IB_QPT_RC,
+ listen_id->res.kern_name);
+ if (IS_ERR(new_cm_id)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ conn_id->state = RDMA_CM_CONNECT;
+
+ ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ ret = cma_acquire_dev(conn_id, listen_id);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ conn_id->cm_id.iw = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_iw_handler;
+
+ memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
+
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret) {
+ /* User wants to destroy the CM ID */
+ conn_id->cm_id.iw = NULL;
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+ mutex_unlock(&listen_id->handler_mutex);
+ cma_deref_id(conn_id);
+ rdma_destroy_id(&conn_id->id);
+ return ret;
+ }
+
+ mutex_unlock(&conn_id->handler_mutex);
+ cma_deref_id(conn_id);
+
+out:
+ mutex_unlock(&listen_id->handler_mutex);
+ return ret;
+}
+
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+ struct sockaddr *addr;
+ struct ib_cm_id *id;
+ __be64 svc_id;
+
+ addr = cma_src_addr(id_priv);
+ svc_id = rdma_get_service_id(&id_priv->id, addr);
+ id = ib_cm_insert_listen(id_priv->id.device,
+ cma_ib_req_handler, svc_id);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+ id_priv->cm_id.ib = id;
+
+ return 0;
+}
+
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+ int ret;
+ struct iw_cm_id *id;
+
+ id = iw_create_cm_id(id_priv->id.device,
+ iw_conn_req_handler,
+ id_priv);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+
+ id->tos = id_priv->tos;
+ id_priv->cm_id.iw = id;
+
+ memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+
+ ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+
+ if (ret) {
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ id_priv->cm_id.iw = NULL;
+ }
+
+ return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct rdma_id_private *id_priv = id->context;
+
+ id->context = id_priv->id.context;
+ id->event_handler = id_priv->id.event_handler;
+ return id_priv->id.event_handler(id, event);
+}
+
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ struct rdma_id_private *dev_id_priv;
+ struct rdma_cm_id *id;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+ int ret;
+
+ lockdep_assert_held(&lock);
+
+ if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1))
+ return;
+
+ id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps,
+ id_priv->id.qp_type, id_priv->res.kern_name);
+ if (IS_ERR(id))
+ return;
+
+ dev_id_priv = container_of(id, struct rdma_id_private, id);
+
+ dev_id_priv->state = RDMA_CM_ADDR_BOUND;
+ memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+
+ _cma_attach_to_dev(dev_id_priv, cma_dev);
+ list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+ atomic_inc(&id_priv->refcount);
+ dev_id_priv->internal_id = 1;
+ dev_id_priv->afonly = id_priv->afonly;
+
+ ret = rdma_listen(id, id_priv->backlog);
+ if (ret)
+ pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n",
+ ret, cma_dev->device->name);
+}
+
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev;
+
+ mutex_lock(&lock);
+ list_add_tail(&id_priv->list, &listen_any_list);
+ list_for_each_entry(cma_dev, &dev_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
+}
+
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->tos = (u8) tos;
+ id_priv->tos_set = true;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+static void cma_query_handler(int status, struct sa_path_rec *path_rec,
+ void *context)
+{
+ struct cma_work *work = context;
+ struct rdma_route *route;
+
+ route = &work->id->id.route;
+
+ if (!status) {
+ route->num_paths = 1;
+ *route->path_rec = *path_rec;
+ } else {
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+ work->event.status = status;
+ pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
+ status);
+ }
+
+ queue_work(cma_wq, &work->work);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+ struct cma_work *work)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct sa_path_rec path_rec;
+ ib_sa_comp_mask comp_mask;
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_ib *sib;
+
+ memset(&path_rec, 0, sizeof path_rec);
+
+ if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num))
+ path_rec.rec_type = SA_PATH_REC_TYPE_OPA;
+ else
+ path_rec.rec_type = SA_PATH_REC_TYPE_IB;
+ rdma_addr_get_sgid(dev_addr, &path_rec.sgid);
+ rdma_addr_get_dgid(dev_addr, &path_rec.dgid);
+ path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ path_rec.numb_path = 1;
+ path_rec.reversible = 1;
+ path_rec.service_id = rdma_get_service_id(&id_priv->id,
+ cma_dst_addr(id_priv));
+
+ comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+ switch (cma_family(id_priv)) {
+ case AF_INET:
+ path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+ comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+ break;
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+ path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
+ }
+
+ id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &path_rec,
+ comp_mask, timeout_ms,
+ GFP_KERNEL, cma_query_handler,
+ work, &id_priv->query);
+
+ return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+
+static void cma_work_handler(struct work_struct *_work)
+{
+ struct cma_work *work = container_of(_work, struct cma_work, work);
+ struct rdma_id_private *id_priv = work->id;
+ int destroy = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+ goto out;
+
+ if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ destroy = 1;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ if (destroy)
+ rdma_destroy_id(&id_priv->id);
+ kfree(work);
+}
+
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+ struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+ struct rdma_id_private *id_priv = work->id;
+ int destroy = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state == RDMA_CM_DESTROYING ||
+ id_priv->state == RDMA_CM_DEVICE_REMOVAL)
+ goto out;
+
+ if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ destroy = 1;
+ }
+
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ if (destroy)
+ rdma_destroy_id(&id_priv->id);
+ kfree(work);
+}
+
+static void cma_init_resolve_route_work(struct cma_work *work,
+ struct rdma_id_private *id_priv)
+{
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+}
+
+static void cma_init_resolve_addr_work(struct cma_work *work,
+ struct rdma_id_private *id_priv)
+{
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ADDR_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+}
+
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct cma_work *work;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ cma_init_resolve_route_work(work, id_priv);
+
+ if (!route->path_rec)
+ route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ ret = cma_query_ib_route(id_priv, timeout_ms, work);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+err1:
+ kfree(work);
+ return ret;
+}
+
+static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type,
+ unsigned long supported_gids,
+ enum ib_gid_type default_gid)
+{
+ if ((network_type == RDMA_NETWORK_IPV4 ||
+ network_type == RDMA_NETWORK_IPV6) &&
+ test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids))
+ return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+ return default_gid;
+}
+
+/*
+ * cma_iboe_set_path_rec_l2_fields() is helper function which sets
+ * path record type based on GID type.
+ * It also sets up other L2 fields which includes destination mac address
+ * netdev ifindex, of the path record.
+ * It returns the netdev of the bound interface for this path record entry.
+ */
+static struct net_device *
+cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ enum ib_gid_type gid_type = IB_GID_TYPE_ROCE;
+ struct rdma_addr *addr = &route->addr;
+ unsigned long supported_gids;
+ struct net_device *ndev;
+
+ if (!addr->dev_addr.bound_dev_if)
+ return NULL;
+
+ ndev = dev_get_by_index(addr->dev_addr.net,
+ addr->dev_addr.bound_dev_if);
+ if (!ndev)
+ return NULL;
+
+ supported_gids = roce_gid_type_mask_support(id_priv->id.device,
+ id_priv->id.port_num);
+ gid_type = cma_route_gid_type(addr->dev_addr.network,
+ supported_gids,
+ id_priv->gid_type);
+ /* Use the hint from IP Stack to select GID Type */
+ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network))
+ gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+ route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type);
+
+ route->path_rec->roce.route_resolved = true;
+ sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr);
+ return ndev;
+}
+
+int rdma_set_ib_path(struct rdma_cm_id *id,
+ struct sa_path_rec *path_rec)
+{
+ struct rdma_id_private *id_priv;
+ struct net_device *ndev;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_RESOLVED))
+ return -EINVAL;
+
+ id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec),
+ GFP_KERNEL);
+ if (!id->route.path_rec) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (rdma_protocol_roce(id->device, id->port_num)) {
+ ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err_free;
+ }
+ dev_put(ndev);
+ }
+
+ id->route.num_paths = 1;
+ return 0;
+
+err_free:
+ kfree(id->route.path_rec);
+ id->route.path_rec = NULL;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_path);
+
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct cma_work *work;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ cma_init_resolve_route_work(work, id_priv);
+ queue_work(cma_wq, &work->work);
+ return 0;
+}
+
+static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+{
+ int prio;
+ struct net_device *dev;
+
+ prio = rt_tos2priority(tos);
+ dev = is_vlan_dev(ndev) ? vlan_dev_real_dev(ndev) : ndev;
+ if (dev->num_tc)
+ return netdev_get_prio_tc_map(dev, prio);
+
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
+ if (is_vlan_dev(ndev))
+ return (vlan_dev_get_egress_qos_mask(ndev, prio) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+#endif
+ return 0;
+}
+
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct rdma_addr *addr = &route->addr;
+ struct cma_work *work;
+ int ret;
+ struct net_device *ndev;
+
+ u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+ u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos;
+
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ route->num_paths = 1;
+
+ ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
+
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &route->path_rec->sgid);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
+ &route->path_rec->dgid);
+
+ if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB)
+ /* TODO: get the hoplimit from the inet/inet6 device */
+ route->path_rec->hop_limit = addr->dev_addr.hoplimit;
+ else
+ route->path_rec->hop_limit = 1;
+ route->path_rec->reversible = 1;
+ route->path_rec->pkey = cpu_to_be16(0xffff);
+ route->path_rec->mtu_selector = IB_SA_EQ;
+ route->path_rec->sl = iboe_tos_to_sl(ndev, tos);
+ route->path_rec->traffic_class = tos;
+ route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+ route->path_rec->rate_selector = IB_SA_EQ;
+ route->path_rec->rate = iboe_get_rate(ndev);
+ dev_put(ndev);
+ route->path_rec->packet_life_time_selector = IB_SA_EQ;
+ route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
+ if (!route->path_rec->mtu) {
+ ret = -EINVAL;
+ goto err2;
+ }
+
+ cma_init_resolve_route_work(work, id_priv);
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+ route->num_paths = 0;
+err1:
+ kfree(work);
+ return ret;
+}
+
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
+ return -EINVAL;
+
+ atomic_inc(&id_priv->refcount);
+ if (rdma_cap_ib_sa(id->device, id->port_num))
+ ret = cma_resolve_ib_route(id_priv, timeout_ms);
+ else if (rdma_protocol_roce(id->device, id->port_num))
+ ret = cma_resolve_iboe_route(id_priv);
+ else if (rdma_protocol_iwarp(id->device, id->port_num))
+ ret = cma_resolve_iw_route(id_priv, timeout_ms);
+ else
+ ret = -ENOSYS;
+
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
+ cma_deref_id(id_priv);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+static void cma_set_loopback(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ break;
+ case AF_INET6:
+ ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ default:
+ ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ }
+}
+
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev, *cur_dev;
+ union ib_gid gid;
+ enum ib_port_state port_state;
+ u16 pkey;
+ int ret;
+ u8 p;
+
+ cma_dev = NULL;
+ mutex_lock(&lock);
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ if (cma_family(id_priv) == AF_IB &&
+ !rdma_cap_ib_cm(cur_dev->device, 1))
+ continue;
+
+ if (!cma_dev)
+ cma_dev = cur_dev;
+
+ for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) &&
+ port_state == IB_PORT_ACTIVE) {
+ cma_dev = cur_dev;
+ goto port_found;
+ }
+ }
+ }
+
+ if (!cma_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ p = 1;
+
+port_found:
+ ret = rdma_query_gid(cma_dev->device, p, 0, &gid);
+ if (ret)
+ goto out;
+
+ ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+ if (ret)
+ goto out;
+
+ id_priv->id.route.addr.dev_addr.dev_type =
+ (rdma_protocol_ib(cma_dev->device, p)) ?
+ ARPHRD_INFINIBAND : ARPHRD_ETHER;
+
+ rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+ id_priv->id.port_num = p;
+ cma_attach_to_dev(id_priv, cma_dev);
+ cma_set_loopback(cma_src_addr(id_priv));
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static void addr_handler(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *dev_addr, void *context)
+{
+ struct rdma_id_private *id_priv = context;
+ struct rdma_cm_event event = {};
+ struct sockaddr *addr;
+ struct sockaddr_storage old_addr;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
+ RDMA_CM_ADDR_RESOLVED))
+ goto out;
+
+ /*
+ * Store the previous src address, so that if we fail to acquire
+ * matching rdma device, old address can be restored back, which helps
+ * to cancel the cma listen operation correctly.
+ */
+ addr = cma_src_addr(id_priv);
+ memcpy(&old_addr, addr, rdma_addr_size(addr));
+ memcpy(addr, src_addr, rdma_addr_size(src_addr));
+ if (!status && !id_priv->cma_dev) {
+ status = cma_acquire_dev(id_priv, NULL);
+ if (status)
+ pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
+ status);
+ } else if (status) {
+ pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status);
+ }
+
+ if (status) {
+ memcpy(addr, &old_addr,
+ rdma_addr_size((struct sockaddr *)&old_addr));
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ADDR_BOUND))
+ goto out;
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = status;
+ } else
+ event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+
+ if (id_priv->id.event_handler(&id_priv->id, &event)) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+}
+
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ union ib_gid gid;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ if (!id_priv->cma_dev) {
+ ret = cma_bind_loopback(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
+
+ cma_init_resolve_addr_work(work, id_priv);
+ queue_work(cma_wq, &work->work);
+ return 0;
+err:
+ kfree(work);
+ return ret;
+}
+
+static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ if (!id_priv->cma_dev) {
+ ret = cma_resolve_ib_dev(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
+ &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));
+
+ cma_init_resolve_addr_work(work, id_priv);
+ queue_work(cma_wq, &work->work);
+ return 0;
+err:
+ kfree(work);
+ return ret;
+}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr)
+{
+ if (!src_addr || !src_addr->sa_family) {
+ src_addr = (struct sockaddr *) &id->route.addr.src_addr;
+ src_addr->sa_family = dst_addr->sa_family;
+ if (IS_ENABLED(CONFIG_IPV6) &&
+ dst_addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
+ } else if (dst_addr->sa_family == AF_IB) {
+ ((struct sockaddr_ib *) src_addr)->sib_pkey =
+ ((struct sockaddr_ib *) dst_addr)->sib_pkey;
+ }
+ }
+ return rdma_bind_addr(id, src_addr);
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == RDMA_CM_IDLE) {
+ ret = cma_bind_addr(id, src_addr, dst_addr);
+ if (ret)
+ return ret;
+ }
+
+ if (cma_family(id_priv) != dst_addr->sa_family)
+ return -EINVAL;
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
+ return -EINVAL;
+
+ memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
+ atomic_inc(&id_priv->refcount);
+ if (cma_any_addr(dst_addr)) {
+ ret = cma_resolve_loopback(id_priv);
+ } else {
+ if (dst_addr->sa_family == AF_IB) {
+ ret = cma_resolve_ib_addr(id_priv);
+ } else {
+ ret = rdma_resolve_ip(cma_src_addr(id_priv),
+ dst_addr, &id->route.addr.dev_addr,
+ timeout_ms, addr_handler, id_priv);
+ }
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+ cma_deref_id(id_priv);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (reuse || id_priv->state == RDMA_CM_IDLE) {
+ id_priv->reuseaddr = reuse;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_reuseaddr);
+
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) {
+ id_priv->options |= (1 << CMA_OPTION_AFONLY);
+ id_priv->afonly = afonly;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_afonly);
+
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv)
+{
+ struct sockaddr *addr;
+ struct sockaddr_ib *sib;
+ u64 sid, mask;
+ __be16 port;
+
+ lockdep_assert_held(&lock);
+
+ addr = cma_src_addr(id_priv);
+ port = htons(bind_list->port);
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_port = port;
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *) addr)->sin6_port = port;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ sid = be64_to_cpu(sib->sib_sid);
+ mask = be64_to_cpu(sib->sib_sid_mask);
+ sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port));
+ sib->sib_sid_mask = cpu_to_be64(~0ULL);
+ break;
+ }
+ id_priv->bind_list = bind_list;
+ hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+static int cma_alloc_port(enum rdma_ucm_port_space ps,
+ struct rdma_id_private *id_priv, unsigned short snum)
+{
+ struct rdma_bind_list *bind_list;
+ int ret;
+
+ lockdep_assert_held(&lock);
+
+ bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+ if (!bind_list)
+ return -ENOMEM;
+
+ ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list,
+ snum);
+ if (ret < 0)
+ goto err;
+
+ bind_list->ps = ps;
+ bind_list->port = (unsigned short)ret;
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+err:
+ kfree(bind_list);
+ return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
+}
+
+static int cma_port_is_unique(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *cur_id;
+ struct sockaddr *daddr = cma_dst_addr(id_priv);
+ struct sockaddr *saddr = cma_src_addr(id_priv);
+ __be16 dport = cma_port(daddr);
+
+ lockdep_assert_held(&lock);
+
+ hlist_for_each_entry(cur_id, &bind_list->owners, node) {
+ struct sockaddr *cur_daddr = cma_dst_addr(cur_id);
+ struct sockaddr *cur_saddr = cma_src_addr(cur_id);
+ __be16 cur_dport = cma_port(cur_daddr);
+
+ if (id_priv == cur_id)
+ continue;
+
+ /* different dest port -> unique */
+ if (!cma_any_port(daddr) &&
+ !cma_any_port(cur_daddr) &&
+ (dport != cur_dport))
+ continue;
+
+ /* different src address -> unique */
+ if (!cma_any_addr(saddr) &&
+ !cma_any_addr(cur_saddr) &&
+ cma_addr_cmp(saddr, cur_saddr))
+ continue;
+
+ /* different dst address -> unique */
+ if (!cma_any_addr(daddr) &&
+ !cma_any_addr(cur_daddr) &&
+ cma_addr_cmp(daddr, cur_daddr))
+ continue;
+
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+}
+
+static int cma_alloc_any_port(enum rdma_ucm_port_space ps,
+ struct rdma_id_private *id_priv)
+{
+ static unsigned int last_used_port;
+ int low, high, remaining;
+ unsigned int rover;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ lockdep_assert_held(&lock);
+
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+ rover = prandom_u32() % remaining + low;
+retry:
+ if (last_used_port != rover) {
+ struct rdma_bind_list *bind_list;
+ int ret;
+
+ bind_list = cma_ps_find(net, ps, (unsigned short)rover);
+
+ if (!bind_list) {
+ ret = cma_alloc_port(ps, id_priv, rover);
+ } else {
+ ret = cma_port_is_unique(bind_list, id_priv);
+ if (!ret)
+ cma_bind_port(bind_list, id_priv);
+ }
+ /*
+ * Remember previously used port number in order to avoid
+ * re-using same port immediately after it is closed.
+ */
+ if (!ret)
+ last_used_port = rover;
+ if (ret != -EADDRNOTAVAIL)
+ return ret;
+ }
+ if (--remaining) {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ goto retry;
+ }
+ return -EADDRNOTAVAIL;
+}
+
+/*
+ * Check that the requested port is available. This is called when trying to
+ * bind to a specific port, or when trying to listen on a bound port. In
+ * the latter case, the provided id_priv may already be on the bind_list, but
+ * we still need to check that it's okay to start listening.
+ */
+static int cma_check_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv, uint8_t reuseaddr)
+{
+ struct rdma_id_private *cur_id;
+ struct sockaddr *addr, *cur_addr;
+
+ lockdep_assert_held(&lock);
+
+ addr = cma_src_addr(id_priv);
+ hlist_for_each_entry(cur_id, &bind_list->owners, node) {
+ if (id_priv == cur_id)
+ continue;
+
+ if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr &&
+ cur_id->reuseaddr)
+ continue;
+
+ cur_addr = cma_src_addr(cur_id);
+ if (id_priv->afonly && cur_id->afonly &&
+ (addr->sa_family != cur_addr->sa_family))
+ continue;
+
+ if (cma_any_addr(addr) || cma_any_addr(cur_addr))
+ return -EADDRNOTAVAIL;
+
+ if (!cma_addr_cmp(addr, cur_addr))
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+static int cma_use_port(enum rdma_ucm_port_space ps,
+ struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list;
+ unsigned short snum;
+ int ret;
+
+ lockdep_assert_held(&lock);
+
+ snum = ntohs(cma_port(cma_src_addr(id_priv)));
+ if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum);
+ if (!bind_list) {
+ ret = cma_alloc_port(ps, id_priv, snum);
+ } else {
+ ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr);
+ if (!ret)
+ cma_bind_port(bind_list, id_priv);
+ }
+ return ret;
+}
+
+static int cma_bind_listen(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+ int ret = 0;
+
+ mutex_lock(&lock);
+ if (bind_list->owners.first->next)
+ ret = cma_check_port(bind_list, id_priv, 0);
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static enum rdma_ucm_port_space
+cma_select_inet_ps(struct rdma_id_private *id_priv)
+{
+ switch (id_priv->id.ps) {
+ case RDMA_PS_TCP:
+ case RDMA_PS_UDP:
+ case RDMA_PS_IPOIB:
+ case RDMA_PS_IB:
+ return id_priv->id.ps;
+ default:
+
+ return 0;
+ }
+}
+
+static enum rdma_ucm_port_space
+cma_select_ib_ps(struct rdma_id_private *id_priv)
+{
+ enum rdma_ucm_port_space ps = 0;
+ struct sockaddr_ib *sib;
+ u64 sid_ps, mask, sid;
+
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK;
+ sid = be64_to_cpu(sib->sib_sid) & mask;
+
+ if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
+ sid_ps = RDMA_IB_IP_PS_IB;
+ ps = RDMA_PS_IB;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
+ (sid == (RDMA_IB_IP_PS_TCP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_TCP;
+ ps = RDMA_PS_TCP;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
+ (sid == (RDMA_IB_IP_PS_UDP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_UDP;
+ ps = RDMA_PS_UDP;
+ }
+
+ if (ps) {
+ sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib)));
+ sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK |
+ be64_to_cpu(sib->sib_sid_mask));
+ }
+ return ps;
+}
+
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+ enum rdma_ucm_port_space ps;
+ int ret;
+
+ if (cma_family(id_priv) != AF_IB)
+ ps = cma_select_inet_ps(id_priv);
+ else
+ ps = cma_select_ib_ps(id_priv);
+ if (!ps)
+ return -EPROTONOSUPPORT;
+
+ mutex_lock(&lock);
+ if (cma_any_port(cma_src_addr(id_priv)))
+ ret = cma_alloc_any_port(ps, id_priv);
+ else
+ ret = cma_use_port(ps, id_priv);
+ mutex_unlock(&lock);
+
+ return ret;
+}
+
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+ struct sockaddr *addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 *sin6;
+
+ if (addr->sa_family != AF_INET6)
+ return 0;
+
+ sin6 = (struct sockaddr_in6 *) addr;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return 0;
+
+ if (!sin6->sin6_scope_id)
+ return -EINVAL;
+
+ dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+ return 0;
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == RDMA_CM_IDLE) {
+ id->route.addr.src_addr.ss_family = AF_INET;
+ ret = rdma_bind_addr(id, cma_src_addr(id_priv));
+ if (ret)
+ return ret;
+ }
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))
+ return -EINVAL;
+
+ if (id_priv->reuseaddr) {
+ ret = cma_bind_listen(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ id_priv->backlog = backlog;
+ if (id->device) {
+ if (rdma_cap_ib_cm(id->device, 1)) {
+ ret = cma_ib_listen(id_priv);
+ if (ret)
+ goto err;
+ } else if (rdma_cap_iw_cm(id->device, 1)) {
+ ret = cma_iw_listen(id_priv, backlog);
+ if (ret)
+ goto err;
+ } else {
+ ret = -ENOSYS;
+ goto err;
+ }
+ } else
+ cma_listen_on_all(id_priv);
+
+ return 0;
+err:
+ id_priv->backlog = 0;
+ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+ struct sockaddr *daddr;
+
+ if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 &&
+ addr->sa_family != AF_IB)
+ return -EAFNOSUPPORT;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
+ return -EINVAL;
+
+ ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+ if (ret)
+ goto err1;
+
+ memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
+ if (!cma_any_addr(addr)) {
+ ret = cma_translate_addr(addr, &id->route.addr.dev_addr);
+ if (ret)
+ goto err1;
+
+ ret = cma_acquire_dev(id_priv, NULL);
+ if (ret)
+ goto err1;
+ }
+
+ if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
+ if (addr->sa_family == AF_INET)
+ id_priv->afonly = 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (addr->sa_family == AF_INET6) {
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ id_priv->afonly = net->ipv6.sysctl.bindv6only;
+ }
+#endif
+ }
+ daddr = cma_dst_addr(id_priv);
+ daddr->sa_family = addr->sa_family;
+
+ ret = cma_get_port(id_priv);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ rdma_restrack_del(&id_priv->res);
+ if (id_priv->cma_dev)
+ cma_release_dev(id_priv);
+err1:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
+static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
+{
+ struct cma_hdr *cma_hdr;
+
+ cma_hdr = hdr;
+ cma_hdr->cma_version = CMA_VERSION;
+ if (cma_family(id_priv) == AF_INET) {
+ struct sockaddr_in *src4, *dst4;
+
+ src4 = (struct sockaddr_in *) cma_src_addr(id_priv);
+ dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 4);
+ cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ cma_hdr->port = src4->sin_port;
+ } else if (cma_family(id_priv) == AF_INET6) {
+ struct sockaddr_in6 *src6, *dst6;
+
+ src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+ dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 6);
+ cma_hdr->src_addr.ip6 = src6->sin6_addr;
+ cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+ cma_hdr->port = src6->sin6_port;
+ }
+ return 0;
+}
+
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event = {};
+ const struct ib_cm_sidr_rep_event_param *rep =
+ &ib_event->param.sidr_rep_rcvd;
+ int ret = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_CONNECT)
+ goto out;
+
+ switch (ib_event->event) {
+ case IB_CM_SIDR_REQ_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ event.param.ud.private_data = ib_event->private_data;
+ event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ if (rep->status != IB_SIDR_SUCCESS) {
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = ib_event->param.sidr_rep_rcvd.status;
+ pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n",
+ event.status);
+ break;
+ }
+ ret = cma_set_qkey(id_priv, rep->qkey);
+ if (ret) {
+ pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret);
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = ret;
+ break;
+ }
+ ib_init_ah_attr_from_path(id_priv->id.device,
+ id_priv->id.port_num,
+ id_priv->id.route.path_rec,
+ &event.param.ud.ah_attr,
+ rep->sgid_attr);
+ event.param.ud.qp_num = rep->qpn;
+ event.param.ud.qkey = rep->qkey;
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.status = 0;
+ break;
+ default:
+ pr_err("RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+
+ rdma_destroy_ah_attr(&event.param.ud.ah_attr);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_sidr_req_param req;
+ struct ib_cm_id *id;
+ void *private_data;
+ u8 offset;
+ int ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv);
+ req.private_data_len = offset + conn_param->private_data_len;
+ if (req.private_data_len < conn_param->private_data_len)
+ return -EINVAL;
+
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
+
+ id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
+ id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
+ goto out;
+ }
+ id_priv->cm_id.ib = id;
+
+ req.path = id_priv->id.route.path_rec;
+ req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+ req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+ ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+out:
+ kfree(private_data);
+ return ret;
+}
+
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_req_param req;
+ struct rdma_route *route;
+ void *private_data;
+ struct ib_cm_id *id;
+ u8 offset;
+ int ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv);
+ req.private_data_len = offset + conn_param->private_data_len;
+ if (req.private_data_len < conn_param->private_data_len)
+ return -EINVAL;
+
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
+ goto out;
+ }
+ id_priv->cm_id.ib = id;
+
+ route = &id_priv->id.route;
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
+
+ req.primary_path = &route->path_rec[0];
+ if (route->num_paths == 2)
+ req.alternate_path = &route->path_rec[1];
+
+ req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
+ /* Alternate path SGID attribute currently unsupported */
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+ req.qp_num = id_priv->qp_num;
+ req.qp_type = id_priv->id.qp_type;
+ req.starting_psn = id_priv->seq_num;
+ req.responder_resources = conn_param->responder_resources;
+ req.initiator_depth = conn_param->initiator_depth;
+ req.flow_control = conn_param->flow_control;
+ req.retry_count = min_t(u8, 7, conn_param->retry_count);
+ req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+ req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+ req.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+ if (ret && !IS_ERR(id)) {
+ ib_destroy_cm_id(id);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ kfree(private_data);
+ return ret;
+}
+
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_id *cm_id;
+ int ret;
+ struct iw_cm_conn_param iw_param;
+
+ cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ cm_id->tos = id_priv->tos;
+ id_priv->cm_id.iw = cm_id;
+
+ memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+ memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv),
+ rdma_addr_size(cma_dst_addr(id_priv)));
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ if (conn_param) {
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
+ } else {
+ memset(&iw_param, 0, sizeof iw_param);
+ iw_param.qpn = id_priv->qp_num;
+ }
+ ret = iw_cm_connect(cm_id, &iw_param);
+out:
+ if (ret) {
+ iw_destroy_cm_id(cm_id);
+ id_priv->cm_id.iw = NULL;
+ }
+ return ret;
+}
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
+ return -EINVAL;
+
+ if (!id->qp) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_resolve_ib_udp(id_priv, conn_param);
+ else
+ ret = cma_connect_ib(id_priv, conn_param);
+ } else if (rdma_cap_iw_cm(id->device, id->port_num))
+ ret = cma_connect_iw(id_priv, conn_param);
+ else
+ ret = -ENOSYS;
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_rep_param rep;
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ ret = cma_modify_qp_rts(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ memset(&rep, 0, sizeof rep);
+ rep.qp_num = id_priv->qp_num;
+ rep.starting_psn = id_priv->seq_num;
+ rep.private_data = conn_param->private_data;
+ rep.private_data_len = conn_param->private_data_len;
+ rep.responder_resources = conn_param->responder_resources;
+ rep.initiator_depth = conn_param->initiator_depth;
+ rep.failover_accepted = 0;
+ rep.flow_control = conn_param->flow_control;
+ rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+ rep.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+out:
+ return ret;
+}
+
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_conn_param iw_param;
+ int ret;
+
+ if (!conn_param)
+ return -EINVAL;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ return ret;
+
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ if (id_priv->id.qp) {
+ iw_param.qpn = id_priv->qp_num;
+ } else
+ iw_param.qpn = conn_param->qp_num;
+
+ return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+ enum ib_cm_sidr_status status, u32 qkey,
+ const void *private_data, int private_data_len)
+{
+ struct ib_cm_sidr_rep_param rep;
+ int ret;
+
+ memset(&rep, 0, sizeof rep);
+ rep.status = status;
+ if (status == IB_SIDR_SUCCESS) {
+ ret = cma_set_qkey(id_priv, qkey);
+ if (ret)
+ return ret;
+ rep.qp_num = id_priv->qp_num;
+ rep.qkey = id_priv->qkey;
+ }
+ rep.private_data = private_data;
+ rep.private_data_len = private_data_len;
+
+ return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
+ const char *caller)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+
+ if (caller)
+ id_priv->res.kern_name = caller;
+ else
+ rdma_restrack_set_task(&id_priv->res, current);
+
+ if (!cma_comp(id_priv, RDMA_CM_CONNECT))
+ return -EINVAL;
+
+ if (!id->qp && conn_param) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (id->qp_type == IB_QPT_UD) {
+ if (conn_param)
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ conn_param->qkey,
+ conn_param->private_data,
+ conn_param->private_data_len);
+ else
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ 0, NULL, 0);
+ } else {
+ if (conn_param)
+ ret = cma_accept_ib(id_priv, conn_param);
+ else
+ ret = cma_rep_recv(id_priv);
+ }
+ } else if (rdma_cap_iw_cm(id->device, id->port_num))
+ ret = cma_accept_iw(id_priv, conn_param);
+ else
+ ret = -ENOSYS;
+
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(id_priv);
+ rdma_reject(id, NULL, 0);
+ return ret;
+}
+EXPORT_SYMBOL(__rdma_accept);
+
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ switch (id->device->node_type) {
+ case RDMA_NODE_IB_CA:
+ ret = ib_cm_notify(id_priv->cm_id.ib, event);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_notify);
+
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
+ private_data, private_data_len);
+ else
+ ret = ib_send_cm_rej(id_priv->cm_id.ib,
+ IB_CM_REJ_CONSUMER_DEFINED, NULL,
+ 0, private_data, private_data_len);
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ ret = iw_cm_reject(id_priv->cm_id.iw,
+ private_data, private_data_len);
+ } else
+ ret = -ENOSYS;
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_reject);
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ ret = cma_modify_qp_err(id_priv);
+ if (ret)
+ goto out;
+ /* Initiate or respond to a disconnect. */
+ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+ ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+ } else
+ ret = -EINVAL;
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc = multicast->context;
+ struct rdma_cm_event event = {};
+ int ret = 0;
+
+ id_priv = mc->id_priv;
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_ADDR_BOUND &&
+ id_priv->state != RDMA_CM_ADDR_RESOLVED)
+ goto out;
+
+ if (!status)
+ status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
+ else
+ pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n",
+ status);
+ event.status = status;
+ event.param.ud.private_data = mc->context;
+ if (!status) {
+ struct rdma_dev_addr *dev_addr =
+ &id_priv->id.route.addr.dev_addr;
+ struct net_device *ndev =
+ dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ enum ib_gid_type gid_type =
+ id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+
+ event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+ ret = ib_init_ah_from_mcmember(id_priv->id.device,
+ id_priv->id.port_num,
+ &multicast->rec,
+ ndev, gid_type,
+ &event.param.ud.ah_attr);
+ if (ret)
+ event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+ event.param.ud.qp_num = 0xFFFFFF;
+ event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+ if (ndev)
+ dev_put(ndev);
+ } else
+ event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+
+ rdma_destroy_ah_attr(&event.param.ud.ah_attr);
+ if (ret) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return 0;
+ }
+
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return 0;
+}
+
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+ struct sockaddr *addr, union ib_gid *mgid)
+{
+ unsigned char mc_map[MAX_ADDR_LEN];
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if ((addr->sa_family == AF_INET6) &&
+ ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+ 0xFF10A01B)) {
+ /* IPv6 address is an SA assigned MGID. */
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else if (addr->sa_family == AF_IB) {
+ memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid);
+ } else if (addr->sa_family == AF_INET6) {
+ ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ } else {
+ ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ }
+}
+
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct ib_sa_mcmember_rec rec;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ ib_sa_comp_mask comp_mask;
+ int ret;
+
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+ &rec.mgid, &rec);
+ if (ret)
+ return ret;
+
+ ret = cma_set_qkey(id_priv, 0);
+ if (ret)
+ return ret;
+
+ cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+ rec.qkey = cpu_to_be32(id_priv->qkey);
+ rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+ rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ rec.join_state = mc->join_state;
+
+ if ((rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) &&
+ (!ib_sa_sendonly_fullmem_support(&sa_client,
+ id_priv->id.device,
+ id_priv->id.port_num))) {
+ pr_warn("RDMA CM: %s port %u Unable to multicast join\n"
+ "RDMA CM: SM doesn't support Send Only Full Member option\n",
+ id_priv->id.device->name, id_priv->id.port_num);
+ return -EOPNOTSUPP;
+ }
+
+ comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+ IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+ if (id_priv->id.ps == RDMA_PS_IPOIB)
+ comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+ mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &rec,
+ comp_mask, GFP_KERNEL,
+ cma_ib_mc_handler, mc);
+ return PTR_ERR_OR_ZERO(mc->multicast.ib);
+}
+
+static void iboe_mcast_work_handler(struct work_struct *work)
+{
+ struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work);
+ struct cma_multicast *mc = mw->mc;
+ struct ib_sa_multicast *m = mc->multicast.ib;
+
+ mc->multicast.ib->context = mc;
+ cma_ib_mc_handler(0, m);
+ kref_put(&mc->mcref, release_mc);
+ kfree(mw);
+}
+
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
+ enum ib_gid_type gid_type)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if (addr->sa_family == AF_INET6) {
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else {
+ mgid->raw[0] =
+ (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff;
+ mgid->raw[1] =
+ (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e;
+ mgid->raw[2] = 0;
+ mgid->raw[3] = 0;
+ mgid->raw[4] = 0;
+ mgid->raw[5] = 0;
+ mgid->raw[6] = 0;
+ mgid->raw[7] = 0;
+ mgid->raw[8] = 0;
+ mgid->raw[9] = 0;
+ mgid->raw[10] = 0xff;
+ mgid->raw[11] = 0xff;
+ *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr;
+ }
+}
+
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct iboe_mcast_work *work;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int err = 0;
+ struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+ struct net_device *ndev = NULL;
+ enum ib_gid_type gid_type;
+ bool send_only;
+
+ send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
+
+ if (cma_zero_addr((struct sockaddr *)&mc->addr))
+ return -EINVAL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+ if (!mc->multicast.ib) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+ cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type);
+
+ mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ if (!ndev) {
+ err = -ENODEV;
+ goto out2;
+ }
+ mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
+ mc->multicast.ib->rec.hop_limit = 1;
+ mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+
+ if (addr->sa_family == AF_INET) {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
+ mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+ if (!send_only) {
+ err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
+ true);
+ }
+ }
+ } else {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+ err = -ENOTSUPP;
+ }
+ dev_put(ndev);
+ if (err || !mc->multicast.ib->rec.mtu) {
+ if (!err)
+ err = -EINVAL;
+ goto out2;
+ }
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &mc->multicast.ib->rec.port_gid);
+ work->id = id_priv;
+ work->mc = mc;
+ INIT_WORK(&work->work, iboe_mcast_work_handler);
+ kref_get(&mc->mcref);
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+out2:
+ kfree(mc->multicast.ib);
+out1:
+ kfree(work);
+ return err;
+}
+
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ u8 join_state, void *context)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+ int ret;
+
+ /* Not supported for kernel QPs */
+ if (WARN_ON(id->qp))
+ return -EINVAL;
+
+ if (!id->device)
+ return -EINVAL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) &&
+ !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED))
+ return -EINVAL;
+
+ mc = kmalloc(sizeof *mc, GFP_KERNEL);
+ if (!mc)
+ return -ENOMEM;
+
+ memcpy(&mc->addr, addr, rdma_addr_size(addr));
+ mc->context = context;
+ mc->id_priv = id_priv;
+ mc->join_state = join_state;
+
+ if (rdma_protocol_roce(id->device, id->port_num)) {
+ kref_init(&mc->mcref);
+ ret = cma_iboe_join_multicast(id_priv, mc);
+ if (ret)
+ goto out_err;
+ } else if (rdma_cap_ib_mcast(id->device, id->port_num)) {
+ ret = cma_join_ib_multicast(id_priv, mc);
+ if (ret)
+ goto out_err;
+ } else {
+ ret = -ENOSYS;
+ goto out_err;
+ }
+
+ spin_lock(&id_priv->lock);
+ list_add(&mc->list, &id_priv->mc_list);
+ spin_unlock(&id_priv->lock);
+
+ return 0;
+out_err:
+ kfree(mc);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irq(&id_priv->lock);
+ list_for_each_entry(mc, &id_priv->mc_list, list) {
+ if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0)
+ continue;
+ list_del(&mc->list);
+ spin_unlock_irq(&id_priv->lock);
+
+ WARN_ON(id_priv->cma_dev->device != id->device);
+ destroy_mc(id_priv, mc);
+ return;
+ }
+ spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr;
+ struct cma_ndev_work *work;
+
+ dev_addr = &id_priv->id.route.addr.dev_addr;
+
+ if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+ (net_eq(dev_net(ndev), dev_addr->net)) &&
+ memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+ pr_info("RDMA CM addr change for ndev %s used by id %p\n",
+ ndev->name, &id_priv->id);
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ INIT_WORK(&work->work, cma_ndev_work_handler);
+ work->id = id_priv;
+ work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+ atomic_inc(&id_priv->refcount);
+ queue_work(cma_wq, &work->work);
+ }
+
+ return 0;
+}
+
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+ void *ptr)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+ int ret = NOTIFY_DONE;
+
+ if (event != NETDEV_BONDING_FAILOVER)
+ return NOTIFY_DONE;
+
+ if (!netif_is_bond_master(ndev))
+ return NOTIFY_DONE;
+
+ mutex_lock(&lock);
+ list_for_each_entry(cma_dev, &dev_list, list)
+ list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+ ret = cma_netdev_change(ndev, id_priv);
+ if (ret)
+ goto out;
+ }
+
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static struct notifier_block cma_nb = {
+ .notifier_call = cma_netdev_callback
+};
+
+static void cma_add_one(struct ib_device *device)
+{
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+ unsigned int i;
+ unsigned long supported_gids = 0;
+
+ cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
+ if (!cma_dev)
+ return;
+
+ cma_dev->device = device;
+ cma_dev->default_gid_type = kcalloc(device->phys_port_cnt,
+ sizeof(*cma_dev->default_gid_type),
+ GFP_KERNEL);
+ if (!cma_dev->default_gid_type)
+ goto free_cma_dev;
+
+ cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt,
+ sizeof(*cma_dev->default_roce_tos),
+ GFP_KERNEL);
+ if (!cma_dev->default_roce_tos)
+ goto free_gid_type;
+
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ supported_gids = roce_gid_type_mask_support(device, i);
+ WARN_ON(!supported_gids);
+ if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE))
+ cma_dev->default_gid_type[i - rdma_start_port(device)] =
+ CMA_PREFERRED_ROCE_GID_TYPE;
+ else
+ cma_dev->default_gid_type[i - rdma_start_port(device)] =
+ find_first_bit(&supported_gids, BITS_PER_LONG);
+ cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0;
+ }
+
+ init_completion(&cma_dev->comp);
+ atomic_set(&cma_dev->refcount, 1);
+ INIT_LIST_HEAD(&cma_dev->id_list);
+ ib_set_client_data(device, &cma_client, cma_dev);
+
+ mutex_lock(&lock);
+ list_add_tail(&cma_dev->list, &dev_list);
+ list_for_each_entry(id_priv, &listen_any_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
+
+ return;
+
+free_gid_type:
+ kfree(cma_dev->default_gid_type);
+
+free_cma_dev:
+ kfree(cma_dev);
+
+ return;
+}
+
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+ struct rdma_cm_event event = {};
+ enum rdma_cm_state state;
+ int ret = 0;
+
+ /* Record that we want to remove the device */
+ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL);
+ if (state == RDMA_CM_DESTROYING)
+ return 0;
+
+ cma_cancel_operation(id_priv, state);
+ mutex_lock(&id_priv->handler_mutex);
+
+ /* Check for destruction from another callback. */
+ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL))
+ goto out;
+
+ event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ mutex_lock(&lock);
+ while (!list_empty(&cma_dev->id_list)) {
+ id_priv = list_entry(cma_dev->id_list.next,
+ struct rdma_id_private, list);
+
+ list_del(&id_priv->listen_list);
+ list_del_init(&id_priv->list);
+ atomic_inc(&id_priv->refcount);
+ mutex_unlock(&lock);
+
+ ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
+ cma_deref_id(id_priv);
+ if (ret)
+ rdma_destroy_id(&id_priv->id);
+
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+
+ cma_deref_dev(cma_dev);
+ wait_for_completion(&cma_dev->comp);
+}
+
+static void cma_remove_one(struct ib_device *device, void *client_data)
+{
+ struct cma_device *cma_dev = client_data;
+
+ if (!cma_dev)
+ return;
+
+ mutex_lock(&lock);
+ list_del(&cma_dev->list);
+ mutex_unlock(&lock);
+
+ cma_process_remove(cma_dev);
+ kfree(cma_dev->default_roce_tos);
+ kfree(cma_dev->default_gid_type);
+ kfree(cma_dev);
+}
+
+static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlmsghdr *nlh;
+ struct rdma_cm_id_stats *id_stats;
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id = NULL;
+ struct cma_device *cma_dev;
+ int i_dev = 0, i_id = 0;
+
+ /*
+ * We export all of the IDs as a sequence of messages. Each
+ * ID gets its own netlink message.
+ */
+ mutex_lock(&lock);
+
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ if (i_dev < cb->args[0]) {
+ i_dev++;
+ continue;
+ }
+
+ i_id = 0;
+ list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+ if (i_id < cb->args[1]) {
+ i_id++;
+ continue;
+ }
+
+ id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
+ sizeof *id_stats, RDMA_NL_RDMA_CM,
+ RDMA_NL_RDMA_CM_ID_STATS,
+ NLM_F_MULTI);
+ if (!id_stats)
+ goto out;
+
+ memset(id_stats, 0, sizeof *id_stats);
+ id = &id_priv->id;
+ id_stats->node_type = id->route.addr.dev_addr.dev_type;
+ id_stats->port_num = id->port_num;
+ id_stats->bound_dev_if =
+ id->route.addr.dev_addr.bound_dev_if;
+
+ if (ibnl_put_attr(skb, nlh,
+ rdma_addr_size(cma_src_addr(id_priv)),
+ cma_src_addr(id_priv),
+ RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
+ goto out;
+ if (ibnl_put_attr(skb, nlh,
+ rdma_addr_size(cma_dst_addr(id_priv)),
+ cma_dst_addr(id_priv),
+ RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
+ goto out;
+
+ id_stats->pid = task_pid_vnr(id_priv->res.task);
+ id_stats->port_space = id->ps;
+ id_stats->cm_state = id_priv->state;
+ id_stats->qp_num = id_priv->qp_num;
+ id_stats->qp_type = id->qp_type;
+
+ i_id++;
+ nlmsg_end(skb, nlh);
+ }
+
+ cb->args[1] = 0;
+ i_dev++;
+ }
+
+out:
+ mutex_unlock(&lock);
+ cb->args[0] = i_dev;
+ cb->args[1] = i_id;
+
+ return skb->len;
+}
+
+static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = {
+ [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats},
+};
+
+static int cma_init_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ idr_init(&pernet->tcp_ps);
+ idr_init(&pernet->udp_ps);
+ idr_init(&pernet->ipoib_ps);
+ idr_init(&pernet->ib_ps);
+
+ return 0;
+}
+
+static void cma_exit_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ idr_destroy(&pernet->tcp_ps);
+ idr_destroy(&pernet->udp_ps);
+ idr_destroy(&pernet->ipoib_ps);
+ idr_destroy(&pernet->ib_ps);
+}
+
+static struct pernet_operations cma_pernet_operations = {
+ .init = cma_init_net,
+ .exit = cma_exit_net,
+ .id = &cma_pernet_id,
+ .size = sizeof(struct cma_pernet),
+};
+
+static int __init cma_init(void)
+{
+ int ret;
+
+ /*
+ * There is a rare lock ordering dependency in cma_netdev_callback()
+ * that only happens when bonding is enabled. Teach lockdep that rtnl
+ * must never be nested under lock so it can find these without having
+ * to test with bonding.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ rtnl_lock();
+ mutex_lock(&lock);
+ mutex_unlock(&lock);
+ rtnl_unlock();
+ }
+
+ cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM);
+ if (!cma_wq)
+ return -ENOMEM;
+
+ ret = register_pernet_subsys(&cma_pernet_operations);
+ if (ret)
+ goto err_wq;
+
+ ib_sa_register_client(&sa_client);
+ register_netdevice_notifier(&cma_nb);
+
+ ret = ib_register_client(&cma_client);
+ if (ret)
+ goto err;
+
+ rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table);
+ cma_configfs_init();
+
+ return 0;
+
+err:
+ unregister_netdevice_notifier(&cma_nb);
+ ib_sa_unregister_client(&sa_client);
+ unregister_pernet_subsys(&cma_pernet_operations);
+err_wq:
+ destroy_workqueue(cma_wq);
+ return ret;
+}
+
+static void __exit cma_cleanup(void)
+{
+ cma_configfs_exit();
+ rdma_nl_unregister(RDMA_NL_RDMA_CM);
+ ib_unregister_client(&cma_client);
+ unregister_netdevice_notifier(&cma_nb);
+ ib_sa_unregister_client(&sa_client);
+ unregister_pernet_subsys(&cma_pernet_operations);
+ destroy_workqueue(cma_wq);
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1);
+
+module_init(cma_init);
+module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
new file mode 100644
index 000000000..ce183d054
--- /dev/null
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <rdma/ib_verbs.h>
+#include "core_priv.h"
+
+struct cma_device;
+
+struct cma_dev_group;
+
+struct cma_dev_port_group {
+ unsigned int port_num;
+ struct cma_dev_group *cma_dev_group;
+ struct config_group group;
+};
+
+struct cma_dev_group {
+ char name[IB_DEVICE_NAME_MAX];
+ struct config_group device_group;
+ struct config_group ports_group;
+ struct cma_dev_port_group *ports;
+};
+
+static struct cma_dev_port_group *to_dev_port_group(struct config_item *item)
+{
+ struct config_group *group;
+
+ if (!item)
+ return NULL;
+
+ group = container_of(item, struct config_group, cg_item);
+ return container_of(group, struct cma_dev_port_group, group);
+}
+
+static bool filter_by_name(struct ib_device *ib_dev, void *cookie)
+{
+ return !strcmp(ib_dev->name, cookie);
+}
+
+static int cma_configfs_params_get(struct config_item *item,
+ struct cma_device **pcma_dev,
+ struct cma_dev_port_group **pgroup)
+{
+ struct cma_dev_port_group *group = to_dev_port_group(item);
+ struct cma_device *cma_dev;
+
+ if (!group)
+ return -ENODEV;
+
+ cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+ group->cma_dev_group->name);
+ if (!cma_dev)
+ return -ENODEV;
+
+ *pcma_dev = cma_dev;
+ *pgroup = group;
+
+ return 0;
+}
+
+static void cma_configfs_params_put(struct cma_device *cma_dev)
+{
+ cma_deref_dev(cma_dev);
+}
+
+static ssize_t default_roce_mode_show(struct config_item *item,
+ char *buf)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ int gid_type;
+ ssize_t ret;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ gid_type = cma_get_default_gid_type(cma_dev, group->port_num);
+ cma_configfs_params_put(cma_dev);
+
+ if (gid_type < 0)
+ return gid_type;
+
+ return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type));
+}
+
+static ssize_t default_roce_mode_store(struct config_item *item,
+ const char *buf, size_t count)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ int gid_type = ib_cache_gid_parse_type_str(buf);
+ ssize_t ret;
+
+ if (gid_type < 0)
+ return -EINVAL;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type);
+
+ cma_configfs_params_put(cma_dev);
+
+ return !ret ? strnlen(buf, count) : ret;
+}
+
+CONFIGFS_ATTR(, default_roce_mode);
+
+static ssize_t default_roce_tos_show(struct config_item *item, char *buf)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ ssize_t ret;
+ u8 tos;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ tos = cma_get_default_roce_tos(cma_dev, group->port_num);
+ cma_configfs_params_put(cma_dev);
+
+ return sprintf(buf, "%u\n", tos);
+}
+
+static ssize_t default_roce_tos_store(struct config_item *item,
+ const char *buf, size_t count)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ ssize_t ret;
+ u8 tos;
+
+ ret = kstrtou8(buf, 0, &tos);
+ if (ret)
+ return ret;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ ret = cma_set_default_roce_tos(cma_dev, group->port_num, tos);
+ cma_configfs_params_put(cma_dev);
+
+ return ret ? ret : strnlen(buf, count);
+}
+
+CONFIGFS_ATTR(, default_roce_tos);
+
+static struct configfs_attribute *cma_configfs_attributes[] = {
+ &attr_default_roce_mode,
+ &attr_default_roce_tos,
+ NULL,
+};
+
+static const struct config_item_type cma_port_group_type = {
+ .ct_attrs = cma_configfs_attributes,
+ .ct_owner = THIS_MODULE
+};
+
+static int make_cma_ports(struct cma_dev_group *cma_dev_group,
+ struct cma_device *cma_dev)
+{
+ struct ib_device *ibdev;
+ unsigned int i;
+ unsigned int ports_num;
+ struct cma_dev_port_group *ports;
+ int err;
+
+ ibdev = cma_get_ib_dev(cma_dev);
+
+ if (!ibdev)
+ return -ENODEV;
+
+ ports_num = ibdev->phys_port_cnt;
+ ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports),
+ GFP_KERNEL);
+
+ if (!ports) {
+ err = -ENOMEM;
+ goto free;
+ }
+
+ for (i = 0; i < ports_num; i++) {
+ char port_str[10];
+
+ ports[i].port_num = i + 1;
+ snprintf(port_str, sizeof(port_str), "%u", i + 1);
+ ports[i].cma_dev_group = cma_dev_group;
+ config_group_init_type_name(&ports[i].group,
+ port_str,
+ &cma_port_group_type);
+ configfs_add_default_group(&ports[i].group,
+ &cma_dev_group->ports_group);
+
+ }
+ cma_dev_group->ports = ports;
+
+ return 0;
+free:
+ kfree(ports);
+ cma_dev_group->ports = NULL;
+ return err;
+}
+
+static void release_cma_dev(struct config_item *item)
+{
+ struct config_group *group = container_of(item, struct config_group,
+ cg_item);
+ struct cma_dev_group *cma_dev_group = container_of(group,
+ struct cma_dev_group,
+ device_group);
+
+ kfree(cma_dev_group);
+};
+
+static void release_cma_ports_group(struct config_item *item)
+{
+ struct config_group *group = container_of(item, struct config_group,
+ cg_item);
+ struct cma_dev_group *cma_dev_group = container_of(group,
+ struct cma_dev_group,
+ ports_group);
+
+ kfree(cma_dev_group->ports);
+ cma_dev_group->ports = NULL;
+};
+
+static struct configfs_item_operations cma_ports_item_ops = {
+ .release = release_cma_ports_group
+};
+
+static const struct config_item_type cma_ports_group_type = {
+ .ct_item_ops = &cma_ports_item_ops,
+ .ct_owner = THIS_MODULE
+};
+
+static struct configfs_item_operations cma_device_item_ops = {
+ .release = release_cma_dev
+};
+
+static const struct config_item_type cma_device_group_type = {
+ .ct_item_ops = &cma_device_item_ops,
+ .ct_owner = THIS_MODULE
+};
+
+static struct config_group *make_cma_dev(struct config_group *group,
+ const char *name)
+{
+ int err = -ENODEV;
+ struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+ (void *)name);
+ struct cma_dev_group *cma_dev_group = NULL;
+
+ if (!cma_dev)
+ goto fail;
+
+ cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL);
+
+ if (!cma_dev_group) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
+
+ config_group_init_type_name(&cma_dev_group->ports_group, "ports",
+ &cma_ports_group_type);
+
+ err = make_cma_ports(cma_dev_group, cma_dev);
+ if (err)
+ goto fail;
+
+ config_group_init_type_name(&cma_dev_group->device_group, name,
+ &cma_device_group_type);
+ configfs_add_default_group(&cma_dev_group->ports_group,
+ &cma_dev_group->device_group);
+
+ cma_deref_dev(cma_dev);
+ return &cma_dev_group->device_group;
+
+fail:
+ if (cma_dev)
+ cma_deref_dev(cma_dev);
+ kfree(cma_dev_group);
+ return ERR_PTR(err);
+}
+
+static void drop_cma_dev(struct config_group *cgroup, struct config_item *item)
+{
+ struct config_group *group =
+ container_of(item, struct config_group, cg_item);
+ struct cma_dev_group *cma_dev_group =
+ container_of(group, struct cma_dev_group, device_group);
+
+ configfs_remove_default_groups(&cma_dev_group->ports_group);
+ configfs_remove_default_groups(&cma_dev_group->device_group);
+ config_item_put(item);
+}
+
+static struct configfs_group_operations cma_subsys_group_ops = {
+ .make_group = make_cma_dev,
+ .drop_item = drop_cma_dev,
+};
+
+static const struct config_item_type cma_subsys_type = {
+ .ct_group_ops = &cma_subsys_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem cma_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "rdma_cm",
+ .ci_type = &cma_subsys_type,
+ },
+ },
+};
+
+int __init cma_configfs_init(void)
+{
+ config_group_init(&cma_subsys.su_group);
+ mutex_init(&cma_subsys.su_mutex);
+ return configfs_register_subsystem(&cma_subsys);
+}
+
+void __exit cma_configfs_exit(void)
+{
+ configfs_unregister_subsystem(&cma_subsys);
+}
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
new file mode 100644
index 000000000..194cfe78c
--- /dev/null
+++ b/drivers/infiniband/core/cma_priv.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CMA_PRIV_H
+#define _CMA_PRIV_H
+
+enum rdma_cm_state {
+ RDMA_CM_IDLE,
+ RDMA_CM_ADDR_QUERY,
+ RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_QUERY,
+ RDMA_CM_ROUTE_RESOLVED,
+ RDMA_CM_CONNECT,
+ RDMA_CM_DISCONNECT,
+ RDMA_CM_ADDR_BOUND,
+ RDMA_CM_LISTEN,
+ RDMA_CM_DEVICE_REMOVAL,
+ RDMA_CM_DESTROYING
+};
+
+struct rdma_id_private {
+ struct rdma_cm_id id;
+
+ struct rdma_bind_list *bind_list;
+ struct hlist_node node;
+ struct list_head list; /* listen_any_list or cma_device.list */
+ struct list_head listen_list; /* per device listens */
+ struct cma_device *cma_dev;
+ struct list_head mc_list;
+
+ int internal_id;
+ enum rdma_cm_state state;
+ spinlock_t lock;
+ struct mutex qp_mutex;
+
+ struct completion comp;
+ atomic_t refcount;
+ struct mutex handler_mutex;
+
+ int backlog;
+ int timeout_ms;
+ struct ib_sa_query *query;
+ int query_id;
+ union {
+ struct ib_cm_id *ib;
+ struct iw_cm_id *iw;
+ } cm_id;
+
+ u32 seq_num;
+ u32 qkey;
+ u32 qp_num;
+ u32 options;
+ u8 srq;
+ u8 tos;
+ bool tos_set;
+ u8 reuseaddr;
+ u8 afonly;
+ enum ib_gid_type gid_type;
+
+ /*
+ * Internal to RDMA/core, don't use in the drivers
+ */
+ struct rdma_restrack_entry res;
+};
+#endif /* _CMA_PRIV_H */
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
new file mode 100644
index 000000000..77c7005c3
--- /dev/null
+++ b/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/opa_addr.h>
+#include <rdma/ib_mad.h>
+#include <rdma/restrack.h>
+#include "mad_priv.h"
+
+/* Total number of ports combined across all struct ib_devices's */
+#define RDMA_MAX_PORTS 1024
+
+struct pkey_index_qp_list {
+ struct list_head pkey_index_list;
+ u16 pkey_index;
+ /* Lock to hold while iterating the qp_list. */
+ spinlock_t qp_list_lock;
+ struct list_head qp_list;
+};
+
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#else
+static inline int cma_configfs_init(void)
+{
+ return 0;
+}
+
+static inline void cma_configfs_exit(void)
+{
+}
+#endif
+struct cma_device;
+void cma_ref_dev(struct cma_device *cma_dev);
+void cma_deref_dev(struct cma_device *cma_dev);
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie);
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port);
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port,
+ enum ib_gid_type default_gid_type);
+int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port);
+int cma_set_default_roce_tos(struct cma_device *a_dev, unsigned int port,
+ u8 default_roce_tos);
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev);
+
+int ib_device_register_sysfs(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *));
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+
+typedef int (*nldev_callback)(struct ib_device *device,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ unsigned int idx);
+
+int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
+ struct netlink_callback *cb);
+
+enum ib_cache_gid_default_mode {
+ IB_CACHE_GID_DEFAULT_MODE_SET,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+int ib_cache_gid_parse_type_str(const char *buf);
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type);
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ unsigned long gid_type_mask,
+ enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
+#ifdef CONFIG_CGROUP_RDMA
+int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index);
+#else
+static inline int ib_device_register_rdmacg(struct ib_device *device)
+{ return 0; }
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{ }
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{ return 0; }
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{ }
+#endif
+
+static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ return netdev_has_upper_dev_all_rcu(dev, upper);
+}
+
+int addr_init(void);
+void addr_cleanup(void);
+
+int ib_mad_init(void);
+void ib_mad_cleanup(void);
+
+int ib_sa_init(void);
+void ib_sa_cleanup(void);
+
+int rdma_nl_init(void);
+void rdma_nl_exit(void);
+
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+
+int ib_get_cached_subnet_prefix(struct ib_device *device,
+ u8 port_num,
+ u64 *sn_pfx);
+
+#ifdef CONFIG_SECURITY_INFINIBAND
+void ib_security_destroy_port_pkey_list(struct ib_device *device);
+
+void ib_security_cache_change(struct ib_device *device,
+ u8 port_num,
+ u64 subnet_prefix);
+
+int ib_security_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata);
+
+int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev);
+void ib_destroy_qp_security_begin(struct ib_qp_security *sec);
+void ib_destroy_qp_security_abort(struct ib_qp_security *sec);
+void ib_destroy_qp_security_end(struct ib_qp_security *sec);
+int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev);
+void ib_close_shared_qp_security(struct ib_qp_security *sec);
+int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
+ enum ib_qp_type qp_type);
+void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent);
+int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index);
+#else
+static inline void ib_security_destroy_port_pkey_list(struct ib_device *device)
+{
+}
+
+static inline void ib_security_cache_change(struct ib_device *device,
+ u8 port_num,
+ u64 subnet_prefix)
+{
+}
+
+static inline int ib_security_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata)
+{
+ return qp->device->modify_qp(qp->real_qp,
+ qp_attr,
+ qp_attr_mask,
+ udata);
+}
+
+static inline int ib_create_qp_security(struct ib_qp *qp,
+ struct ib_device *dev)
+{
+ return 0;
+}
+
+static inline void ib_destroy_qp_security_begin(struct ib_qp_security *sec)
+{
+}
+
+static inline void ib_destroy_qp_security_abort(struct ib_qp_security *sec)
+{
+}
+
+static inline void ib_destroy_qp_security_end(struct ib_qp_security *sec)
+{
+}
+
+static inline int ib_open_shared_qp_security(struct ib_qp *qp,
+ struct ib_device *dev)
+{
+ return 0;
+}
+
+static inline void ib_close_shared_qp_security(struct ib_qp_security *sec)
+{
+}
+
+static inline int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
+ enum ib_qp_type qp_type)
+{
+ return 0;
+}
+
+static inline void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
+{
+}
+
+static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
+ u16 pkey_index)
+{
+ return 0;
+}
+#endif
+
+struct ib_device *ib_device_get_by_index(u32 ifindex);
+/* RDMA device netlink */
+void nldev_init(void);
+void nldev_exit(void);
+
+static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
+ struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata,
+ struct ib_uobject *uobj)
+{
+ struct ib_qp *qp;
+
+ if (!dev->create_qp)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ qp = dev->create_qp(pd, attr, udata);
+ if (IS_ERR(qp))
+ return qp;
+
+ qp->device = dev;
+ qp->pd = pd;
+ qp->uobject = uobj;
+ /*
+ * We don't track XRC QPs for now, because they don't have PD
+ * and more importantly they are created internaly by driver,
+ * see mlx5 create_dev_resources() as an example.
+ */
+ if (attr->qp_type < IB_QPT_XRC_INI) {
+ qp->res.type = RDMA_RESTRACK_QP;
+ rdma_restrack_add(&qp->res);
+ } else
+ qp->res.valid = false;
+
+ return qp;
+}
+
+struct rdma_dev_addr;
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr);
+
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *dmac, const struct net_device *ndev,
+ int *hoplimit);
+
+#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644
index 000000000..9271f7290
--- /dev/null
+++ b/drivers/infiniband/core/cq.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH 16
+#define IB_POLL_BATCH_DIRECT 8
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ 256
+#define IB_POLL_BUDGET_WORKQUEUE 65536
+
+#define IB_POLL_FLAGS \
+ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
+ int batch)
+{
+ int i, n, completed = 0;
+
+ /*
+ * budget might be (-1) if the caller does not
+ * want to bound this call, thus we need unsigned
+ * minimum here.
+ */
+ while ((n = ib_poll_cq(cq, min_t(u32, batch,
+ budget - completed), wcs)) > 0) {
+ for (i = 0; i < n; i++) {
+ struct ib_wc *wc = &wcs[i];
+
+ if (wc->wr_cqe)
+ wc->wr_cqe->done(cq, wc);
+ else
+ WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+ }
+
+ completed += n;
+
+ if (n != batch || (budget != -1 && completed >= budget))
+ break;
+ }
+
+ return completed;
+}
+
+/**
+ * ib_process_direct_cq - process a CQ in caller context
+ * @cq: CQ to process
+ * @budget: number of CQEs to poll for
+ *
+ * This function is used to process all outstanding CQ entries.
+ * It does not offload CQ processing to a different context and does
+ * not ask for completion interrupts from the HCA.
+ * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger
+ * concurrent processing.
+ *
+ * Note: do not pass -1 as %budget unless it is guaranteed that the number
+ * of completions that will be processed is small.
+ */
+int ib_process_cq_direct(struct ib_cq *cq, int budget)
+{
+ struct ib_wc wcs[IB_POLL_BATCH_DIRECT];
+
+ return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT);
+}
+EXPORT_SYMBOL(ib_process_cq_direct);
+
+static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
+{
+ WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
+}
+
+static int ib_poll_handler(struct irq_poll *iop, int budget)
+{
+ struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+ int completed;
+
+ completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
+ if (completed < budget) {
+ irq_poll_complete(&cq->iop);
+ if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+ irq_poll_sched(&cq->iop);
+ }
+
+ return completed;
+}
+
+static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
+{
+ irq_poll_sched(&cq->iop);
+}
+
+static void ib_cq_poll_work(struct work_struct *work)
+{
+ struct ib_cq *cq = container_of(work, struct ib_cq, work);
+ int completed;
+
+ completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
+ IB_POLL_BATCH);
+ if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
+ ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+ queue_work(cq->comp_wq, &cq->work);
+}
+
+static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+ queue_work(cq->comp_wq, &cq->work);
+}
+
+/**
+ * __ib_alloc_cq - allocate a completion queue
+ * @dev: device to allocate the CQ for
+ * @private: driver private data, accessible from cq->cq_context
+ * @nr_cqe: number of CQEs to allocate
+ * @comp_vector: HCA completion vectors for this CQ
+ * @poll_ctx: context to poll the CQ from.
+ * @caller: module owner name.
+ *
+ * This is the proper interface to allocate a CQ for in-kernel users. A
+ * CQ allocated with this interface will automatically be polled from the
+ * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
+ * to use this CQ abstraction.
+ */
+struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
+ int nr_cqe, int comp_vector,
+ enum ib_poll_context poll_ctx, const char *caller)
+{
+ struct ib_cq_init_attr cq_attr = {
+ .cqe = nr_cqe,
+ .comp_vector = comp_vector,
+ };
+ struct ib_cq *cq;
+ int ret = -ENOMEM;
+
+ cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
+ if (IS_ERR(cq))
+ return cq;
+
+ cq->device = dev;
+ cq->uobject = NULL;
+ cq->event_handler = NULL;
+ cq->cq_context = private;
+ cq->poll_ctx = poll_ctx;
+ atomic_set(&cq->usecnt, 0);
+
+ cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
+ if (!cq->wc)
+ goto out_destroy_cq;
+
+ cq->res.type = RDMA_RESTRACK_CQ;
+ cq->res.kern_name = caller;
+ rdma_restrack_add(&cq->res);
+
+ switch (cq->poll_ctx) {
+ case IB_POLL_DIRECT:
+ cq->comp_handler = ib_cq_completion_direct;
+ break;
+ case IB_POLL_SOFTIRQ:
+ cq->comp_handler = ib_cq_completion_softirq;
+
+ irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ break;
+ case IB_POLL_WORKQUEUE:
+ case IB_POLL_UNBOUND_WORKQUEUE:
+ cq->comp_handler = ib_cq_completion_workqueue;
+ INIT_WORK(&cq->work, ib_cq_poll_work);
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
+ ib_comp_wq : ib_comp_unbound_wq;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out_free_wc;
+ }
+
+ return cq;
+
+out_free_wc:
+ kfree(cq->wc);
+ rdma_restrack_del(&cq->res);
+out_destroy_cq:
+ cq->device->destroy_cq(cq);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(__ib_alloc_cq);
+
+/**
+ * ib_free_cq - free a completion queue
+ * @cq: completion queue to free.
+ */
+void ib_free_cq(struct ib_cq *cq)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
+ return;
+
+ switch (cq->poll_ctx) {
+ case IB_POLL_DIRECT:
+ break;
+ case IB_POLL_SOFTIRQ:
+ irq_poll_disable(&cq->iop);
+ break;
+ case IB_POLL_WORKQUEUE:
+ case IB_POLL_UNBOUND_WORKQUEUE:
+ cancel_work_sync(&cq->work);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ kfree(cq->wc);
+ rdma_restrack_del(&cq->res);
+ ret = cq->device->destroy_cq(cq);
+ WARN_ON_ONCE(ret);
+}
+EXPORT_SYMBOL(ib_free_cq);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
new file mode 100644
index 000000000..ffd0f43e2
--- /dev/null
+++ b/drivers/infiniband/core/device.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_client_data {
+ struct list_head list;
+ struct ib_client *client;
+ void * data;
+ /* The device or client is going down. Do not call client or device
+ * callbacks other than remove(). */
+ bool going_down;
+};
+
+struct workqueue_struct *ib_comp_wq;
+struct workqueue_struct *ib_comp_unbound_wq;
+struct workqueue_struct *ib_wq;
+EXPORT_SYMBOL_GPL(ib_wq);
+
+/* The device_list and client_list contain devices and clients after their
+ * registration has completed, and the devices and clients are removed
+ * during unregistration. */
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list. device_mutex protects writer access by device and client
+ * registration / de-registration. lists_rwsem protects reader access to
+ * these lists. Iterators of these lists must lock it for read, while updates
+ * to the lists must be done with a write lock. A special case is when the
+ * device_mutex is locked. In this case locking the lists for read access is
+ * not necessary as the device_mutex implies it.
+ *
+ * lists_rwsem also protects access to the client data list.
+ */
+static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
+
+static int ib_security_change(struct notifier_block *nb, unsigned long event,
+ void *lsm_data);
+static void ib_policy_change_task(struct work_struct *work);
+static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
+
+static struct notifier_block ibdev_lsm_nb = {
+ .notifier_call = ib_security_change,
+};
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ IB_MANDATORY_FUNC(query_device),
+ IB_MANDATORY_FUNC(query_port),
+ IB_MANDATORY_FUNC(query_pkey),
+ IB_MANDATORY_FUNC(alloc_pd),
+ IB_MANDATORY_FUNC(dealloc_pd),
+ IB_MANDATORY_FUNC(create_qp),
+ IB_MANDATORY_FUNC(modify_qp),
+ IB_MANDATORY_FUNC(destroy_qp),
+ IB_MANDATORY_FUNC(post_send),
+ IB_MANDATORY_FUNC(post_recv),
+ IB_MANDATORY_FUNC(create_cq),
+ IB_MANDATORY_FUNC(destroy_cq),
+ IB_MANDATORY_FUNC(poll_cq),
+ IB_MANDATORY_FUNC(req_notify_cq),
+ IB_MANDATORY_FUNC(get_dma_mr),
+ IB_MANDATORY_FUNC(dereg_mr),
+ IB_MANDATORY_FUNC(get_port_immutable)
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+ if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
+ pr_warn("Device %s is missing mandatory function %s\n",
+ device->name, mandatory_table[i].name);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static struct ib_device *__ib_device_get_by_index(u32 index)
+{
+ struct ib_device *device;
+
+ list_for_each_entry(device, &device_list, core_list)
+ if (device->index == index)
+ return device;
+
+ return NULL;
+}
+
+/*
+ * Caller is responsible to return refrerence count by calling put_device()
+ */
+struct ib_device *ib_device_get_by_index(u32 index)
+{
+ struct ib_device *device;
+
+ down_read(&lists_rwsem);
+ device = __ib_device_get_by_index(index);
+ if (device)
+ get_device(&device->dev);
+
+ up_read(&lists_rwsem);
+ return device;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+ struct ib_device *device;
+
+ list_for_each_entry(device, &device_list, core_list)
+ if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+ return device;
+
+ return NULL;
+}
+
+static int alloc_name(char *name)
+{
+ unsigned long *inuse;
+ char buf[IB_DEVICE_NAME_MAX];
+ struct ib_device *device;
+ int i;
+
+ inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ if (!inuse)
+ return -ENOMEM;
+
+ list_for_each_entry(device, &device_list, core_list) {
+ if (!sscanf(device->name, name, &i))
+ continue;
+ if (i < 0 || i >= PAGE_SIZE * 8)
+ continue;
+ snprintf(buf, sizeof buf, name, i);
+ if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+ set_bit(i, inuse);
+ }
+
+ i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+ free_page((unsigned long) inuse);
+ snprintf(buf, sizeof buf, name, i);
+
+ if (__ib_device_get_by_name(buf))
+ return -ENFILE;
+
+ strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+ return 0;
+}
+
+static void ib_device_release(struct device *device)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
+ if (dev->reg_state == IB_DEV_UNREGISTERED) {
+ /*
+ * In IB_DEV_UNINITIALIZED state, cache or port table
+ * is not even created. Free cache and port table only when
+ * device reaches UNREGISTERED state.
+ */
+ ib_cache_release_one(dev);
+ kfree(dev->port_immutable);
+ }
+ kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+ struct kobj_uevent_env *env)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ if (add_uevent_var(env, "NAME=%s", dev->name))
+ return -ENOMEM;
+
+ /*
+ * It would be nice to pass the node GUID with the event...
+ */
+
+ return 0;
+}
+
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+ .dev_uevent = ib_device_uevent,
+};
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device. @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+ struct ib_device *device;
+
+ if (WARN_ON(size < sizeof(struct ib_device)))
+ return NULL;
+
+ device = kzalloc(size, GFP_KERNEL);
+ if (!device)
+ return NULL;
+
+ rdma_restrack_init(&device->res);
+
+ device->dev.class = &ib_class;
+ device_initialize(&device->dev);
+
+ dev_set_drvdata(&device->dev, device);
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+ INIT_LIST_HEAD(&device->client_data_list);
+ INIT_LIST_HEAD(&device->port_list);
+
+ return device;
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+ WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+ device->reg_state != IB_DEV_UNINITIALIZED);
+ rdma_restrack_clean(&device->res);
+ put_device(&device->dev);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return -ENOMEM;
+
+ context->client = client;
+ context->data = NULL;
+ context->going_down = false;
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_add(&context->list, &device->client_data_list);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+
+ return 0;
+}
+
+static int verify_immutable(const struct ib_device *dev, u8 port)
+{
+ return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
+ rdma_max_mad_size(dev, port) != 0);
+}
+
+static int read_port_immutable(struct ib_device *device)
+{
+ int ret;
+ u8 start_port = rdma_start_port(device);
+ u8 end_port = rdma_end_port(device);
+ u8 port;
+
+ /**
+ * device->port_immutable is indexed directly by the port number to make
+ * access to this data as efficient as possible.
+ *
+ * Therefore port_immutable is declared as a 1 based array with
+ * potential empty slots at the beginning.
+ */
+ device->port_immutable = kcalloc(end_port + 1,
+ sizeof(*device->port_immutable),
+ GFP_KERNEL);
+ if (!device->port_immutable)
+ return -ENOMEM;
+
+ for (port = start_port; port <= end_port; ++port) {
+ ret = device->get_port_immutable(device, port,
+ &device->port_immutable[port]);
+ if (ret)
+ return ret;
+
+ if (verify_immutable(device, port))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+void ib_get_device_fw_str(struct ib_device *dev, char *str)
+{
+ if (dev->get_dev_fw_str)
+ dev->get_dev_fw_str(dev, str);
+ else
+ str[0] = '\0';
+}
+EXPORT_SYMBOL(ib_get_device_fw_str);
+
+static int setup_port_pkey_list(struct ib_device *device)
+{
+ int i;
+
+ /**
+ * device->port_pkey_list is indexed directly by the port number,
+ * Therefore it is declared as a 1 based array with potential empty
+ * slots at the beginning.
+ */
+ device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
+ sizeof(*device->port_pkey_list),
+ GFP_KERNEL);
+
+ if (!device->port_pkey_list)
+ return -ENOMEM;
+
+ for (i = 0; i < (rdma_end_port(device) + 1); i++) {
+ spin_lock_init(&device->port_pkey_list[i].list_lock);
+ INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
+ }
+
+ return 0;
+}
+
+static void ib_policy_change_task(struct work_struct *work)
+{
+ struct ib_device *dev;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list) {
+ int i;
+
+ for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
+ u64 sp;
+ int ret = ib_get_cached_subnet_prefix(dev,
+ i,
+ &sp);
+
+ WARN_ONCE(ret,
+ "ib_get_cached_subnet_prefix err: %d, this should never happen here\n",
+ ret);
+ if (!ret)
+ ib_security_cache_change(dev, i, sp);
+ }
+ }
+ up_read(&lists_rwsem);
+}
+
+static int ib_security_change(struct notifier_block *nb, unsigned long event,
+ void *lsm_data)
+{
+ if (event != LSM_POLICY_CHANGE)
+ return NOTIFY_DONE;
+
+ schedule_work(&ib_policy_change_work);
+
+ return NOTIFY_OK;
+}
+
+/**
+ * __dev_new_index - allocate an device index
+ *
+ * Returns a suitable unique value for a new device interface
+ * number. It assumes that there are less than 2^32-1 ib devices
+ * will be present in the system.
+ */
+static u32 __dev_new_index(void)
+{
+ /*
+ * The device index to allow stable naming.
+ * Similar to struct net -> ifindex.
+ */
+ static u32 index;
+
+ for (;;) {
+ if (!(++index))
+ index = 1;
+
+ if (!__ib_device_get_by_index(index))
+ return index;
+ }
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core. All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ int ret;
+ struct ib_client *client;
+ struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+ struct device *parent = device->dev.parent;
+
+ WARN_ON_ONCE(device->dma_device);
+ if (device->dev.dma_ops) {
+ /*
+ * The caller provided custom DMA operations. Copy the
+ * DMA-related fields that are used by e.g. dma_alloc_coherent()
+ * into device->dev.
+ */
+ device->dma_device = &device->dev;
+ if (!device->dev.dma_mask) {
+ if (parent)
+ device->dev.dma_mask = parent->dma_mask;
+ else
+ WARN_ON_ONCE(true);
+ }
+ if (!device->dev.coherent_dma_mask) {
+ if (parent)
+ device->dev.coherent_dma_mask =
+ parent->coherent_dma_mask;
+ else
+ WARN_ON_ONCE(true);
+ }
+ } else {
+ /*
+ * The caller did not provide custom DMA operations. Use the
+ * DMA mapping operations of the parent device.
+ */
+ WARN_ON_ONCE(!parent);
+ device->dma_device = parent;
+ }
+
+ mutex_lock(&device_mutex);
+
+ if (strchr(device->name, '%')) {
+ ret = alloc_name(device->name);
+ if (ret)
+ goto out;
+ }
+
+ if (ib_device_check_mandatory(device)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = read_port_immutable(device);
+ if (ret) {
+ pr_warn("Couldn't create per port immutable data %s\n",
+ device->name);
+ goto out;
+ }
+
+ ret = setup_port_pkey_list(device);
+ if (ret) {
+ pr_warn("Couldn't create per port_pkey_list\n");
+ goto out;
+ }
+
+ ret = ib_cache_setup_one(device);
+ if (ret) {
+ pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n");
+ goto port_cleanup;
+ }
+
+ ret = ib_device_register_rdmacg(device);
+ if (ret) {
+ pr_warn("Couldn't register device with rdma cgroup\n");
+ goto cache_cleanup;
+ }
+
+ memset(&device->attrs, 0, sizeof(device->attrs));
+ ret = device->query_device(device, &device->attrs, &uhw);
+ if (ret) {
+ pr_warn("Couldn't query the device attributes\n");
+ goto cg_cleanup;
+ }
+
+ ret = ib_device_register_sysfs(device, port_callback);
+ if (ret) {
+ pr_warn("Couldn't register device %s with driver model\n",
+ device->name);
+ goto cg_cleanup;
+ }
+
+ device->reg_state = IB_DEV_REGISTERED;
+
+ list_for_each_entry(client, &client_list, list)
+ if (!add_client_context(device, client) && client->add)
+ client->add(device);
+
+ device->index = __dev_new_index();
+ down_write(&lists_rwsem);
+ list_add_tail(&device->core_list, &device_list);
+ up_write(&lists_rwsem);
+ mutex_unlock(&device_mutex);
+ return 0;
+
+cg_cleanup:
+ ib_device_unregister_rdmacg(device);
+cache_cleanup:
+ ib_cache_cleanup_one(device);
+ ib_cache_release_one(device);
+port_cleanup:
+ kfree(device->port_immutable);
+out:
+ mutex_unlock(&device_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device. All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+ struct ib_client_data *context, *tmp;
+ unsigned long flags;
+
+ mutex_lock(&device_mutex);
+
+ down_write(&lists_rwsem);
+ list_del(&device->core_list);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ context->going_down = true;
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ downgrade_write(&lists_rwsem);
+
+ list_for_each_entry_safe(context, tmp, &device->client_data_list,
+ list) {
+ if (context->client->remove)
+ context->client->remove(device, context->data);
+ }
+ up_read(&lists_rwsem);
+
+ ib_device_unregister_sysfs(device);
+ ib_device_unregister_rdmacg(device);
+
+ mutex_unlock(&device_mutex);
+
+ ib_cache_cleanup_one(device);
+
+ ib_security_destroy_port_pkey_list(device);
+ kfree(device->port_pkey_list);
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ kfree(context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+
+ device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+ struct ib_device *device;
+
+ mutex_lock(&device_mutex);
+
+ list_for_each_entry(device, &device_list, core_list)
+ if (!add_client_context(device, client) && client->add)
+ client->add(device);
+
+ down_write(&lists_rwsem);
+ list_add_tail(&client->list, &client_list);
+ up_write(&lists_rwsem);
+
+ mutex_unlock(&device_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+ struct ib_client_data *context, *tmp;
+ struct ib_device *device;
+ unsigned long flags;
+
+ mutex_lock(&device_mutex);
+
+ down_write(&lists_rwsem);
+ list_del(&client->list);
+ up_write(&lists_rwsem);
+
+ list_for_each_entry(device, &device_list, core_list) {
+ struct ib_client_data *found_context = NULL;
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ if (context->client == client) {
+ context->going_down = true;
+ found_context = context;
+ break;
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+
+ if (client->remove)
+ client->remove(device, found_context ?
+ found_context->data : NULL);
+
+ if (!found_context) {
+ pr_warn("No client context found for %s/%s\n",
+ device->name, client->name);
+ continue;
+ }
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_del(&found_context->list);
+ kfree(found_context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+ }
+
+ mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ void *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ ret = context->data;
+ break;
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ context->data = data;
+ goto out;
+ }
+
+ pr_warn("No client context found for %s/%s\n",
+ device->name, client->name);
+
+out:
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification). This
+ * callback may occur in interrupt context.
+ */
+void ib_register_event_handler(struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_add_tail(&event_handler->list,
+ &event_handler->device->event_handler_list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+void ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_del(&event_handler->list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+ unsigned long flags;
+ struct ib_event_handler *handler;
+
+ spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+ list_for_each_entry(handler, &event->device->event_handler_list, list)
+ handler->handler(handler, event);
+
+ spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr)
+{
+ union ib_gid gid;
+ int err;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ memset(port_attr, 0, sizeof(*port_attr));
+ err = device->query_port(device, port_num, port_attr);
+ if (err || port_attr->subnet_prefix)
+ return err;
+
+ if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
+ return 0;
+
+ err = device->query_gid(device, port_num, 0, &gid);
+ if (err)
+ return err;
+
+ port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
+ return 0;
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * which are related to netdevice and calls callback() on each
+ * device for which filter() function returns non zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ u8 port;
+
+ for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+ port++)
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct net_device *idev = NULL;
+
+ if (ib_dev->get_netdev)
+ idev = ib_dev->get_netdev(ib_dev, port);
+
+ if (idev &&
+ idev->reg_state >= NETREG_UNREGISTERED) {
+ dev_put(idev);
+ idev = NULL;
+ }
+
+ if (filter(ib_dev, port, idev, filter_cookie))
+ cb(ib_dev, port, idev, cookie);
+
+ if (idev)
+ dev_put(idev);
+ }
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all RoCE devices' physical ports which are related
+ * to netdevices and calls callback() on each device for which
+ * filter() function returns non zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ struct ib_device *dev;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list)
+ ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+ up_read(&lists_rwsem);
+}
+
+/**
+ * ib_enum_all_devs - enumerate all ib_devices
+ * @cb: Callback to call for each found ib_device
+ *
+ * Enumerates all ib_devices and calls callback() on each device.
+ */
+int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ib_device *dev;
+ unsigned int idx = 0;
+ int ret = 0;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list) {
+ ret = nldev_cb(dev, skb, cb, idx);
+ if (ret)
+ break;
+ idx++;
+ }
+
+ up_read(&lists_rwsem);
+ return ret;
+}
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ if (!device->modify_device)
+ return -ENOSYS;
+
+ return device->modify_device(device, device_modify_mask,
+ device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ * to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify)
+{
+ int rc;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ if (device->modify_port)
+ rc = device->modify_port(device, port_num, port_modify_mask,
+ port_modify);
+ else
+ rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
+ return rc;
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs. Its searches only for IB link layer.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index)
+{
+ union ib_gid tmp_gid;
+ int ret, port, i;
+
+ for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+ if (!rdma_protocol_ib(device, port))
+ continue;
+
+ for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
+ ret = rdma_query_gid(device, port, i, &tmp_gid);
+ if (ret)
+ continue;
+
+ if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+ *port_num = port;
+ if (index)
+ *index = i;
+ return 0;
+ }
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index)
+{
+ int ret, i;
+ u16 tmp_pkey;
+ int partial_ix = -1;
+
+ for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
+ ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+ if (ret)
+ return ret;
+ if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+ /* if there is full-member pkey take it.*/
+ if (tmp_pkey & 0x8000) {
+ *index = i;
+ return 0;
+ }
+ if (partial_ix < 0)
+ partial_ix = i;
+ }
+ }
+
+ /*no full-member, if exists take the limited*/
+ if (partial_ix >= 0) {
+ *index = partial_ix;
+ return 0;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev: An RDMA device on which the request has been received.
+ * @port: Port number on the RDMA device.
+ * @pkey: The Pkey the request came on.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: Contains the IP address that the request specified as its
+ * destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr)
+{
+ struct net_device *net_dev = NULL;
+ struct ib_client_data *context;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ down_read(&lists_rwsem);
+
+ list_for_each_entry(context, &dev->client_data_list, list) {
+ struct ib_client *client = context->client;
+
+ if (context->going_down)
+ continue;
+
+ if (client->get_net_dev_by_params) {
+ net_dev = client->get_net_dev_by_params(dev, port, pkey,
+ gid, addr,
+ context->data);
+ if (net_dev)
+ break;
+ }
+ }
+
+ up_read(&lists_rwsem);
+
+ return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
+static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
+ [RDMA_NL_LS_OP_RESOLVE] = {
+ .doit = ib_nl_handle_resolve_resp,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+ .doit = ib_nl_handle_set_timeout,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NL_LS_OP_IP_RESOLVE] = {
+ .doit = ib_nl_handle_ip_res_resp,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+};
+
+static int __init ib_core_init(void)
+{
+ int ret;
+
+ ib_wq = alloc_workqueue("infiniband", 0, 0);
+ if (!ib_wq)
+ return -ENOMEM;
+
+ ib_comp_wq = alloc_workqueue("ib-comp-wq",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ if (!ib_comp_wq) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ib_comp_unbound_wq =
+ alloc_workqueue("ib-comp-unb-wq",
+ WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
+ WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
+ if (!ib_comp_unbound_wq) {
+ ret = -ENOMEM;
+ goto err_comp;
+ }
+
+ ret = class_register(&ib_class);
+ if (ret) {
+ pr_warn("Couldn't create InfiniBand device class\n");
+ goto err_comp_unbound;
+ }
+
+ ret = rdma_nl_init();
+ if (ret) {
+ pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
+ goto err_sysfs;
+ }
+
+ ret = addr_init();
+ if (ret) {
+ pr_warn("Could't init IB address resolution\n");
+ goto err_ibnl;
+ }
+
+ ret = ib_mad_init();
+ if (ret) {
+ pr_warn("Couldn't init IB MAD\n");
+ goto err_addr;
+ }
+
+ ret = ib_sa_init();
+ if (ret) {
+ pr_warn("Couldn't init SA\n");
+ goto err_mad;
+ }
+
+ ret = register_lsm_notifier(&ibdev_lsm_nb);
+ if (ret) {
+ pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
+ goto err_sa;
+ }
+
+ nldev_init();
+ rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
+ roce_gid_mgmt_init();
+
+ return 0;
+
+err_sa:
+ ib_sa_cleanup();
+err_mad:
+ ib_mad_cleanup();
+err_addr:
+ addr_cleanup();
+err_ibnl:
+ rdma_nl_exit();
+err_sysfs:
+ class_unregister(&ib_class);
+err_comp_unbound:
+ destroy_workqueue(ib_comp_unbound_wq);
+err_comp:
+ destroy_workqueue(ib_comp_wq);
+err:
+ destroy_workqueue(ib_wq);
+ return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+ roce_gid_mgmt_cleanup();
+ nldev_exit();
+ rdma_nl_unregister(RDMA_NL_LS);
+ unregister_lsm_notifier(&ibdev_lsm_nb);
+ ib_sa_cleanup();
+ ib_mad_cleanup();
+ addr_cleanup();
+ rdma_nl_exit();
+ class_unregister(&ib_class);
+ destroy_workqueue(ib_comp_unbound_wq);
+ destroy_workqueue(ib_comp_wq);
+ /* Make sure that any pending umem accounting work is done. */
+ destroy_workqueue(ib_wq);
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
+
+subsys_initcall(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 000000000..a077500f7
--- /dev/null
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <rdma/ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+#define PFX "fmr_pool: "
+
+enum {
+ IB_FMR_MAX_REMAPS = 32,
+
+ IB_FMR_HASH_BITS = 8,
+ IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS,
+ IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < pool->max_remaps) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped). In either of
+ * these cases it is a bug if the ref_count is not 0. In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled). This is independent of the reference
+ * count of the FMR. When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate. However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_register_physical() occurs before the FMR is remapped. In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR). When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+ spinlock_t pool_lock;
+
+ int pool_size;
+ int max_pages;
+ int max_remaps;
+ int dirty_watermark;
+ int dirty_len;
+ struct list_head free_list;
+ struct list_head dirty_list;
+ struct hlist_head *cache_bucket;
+
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void * arg);
+ void *flush_arg;
+
+ struct kthread_worker *worker;
+ struct kthread_work work;
+
+ atomic_t req_ser;
+ atomic_t flush_ser;
+
+ wait_queue_head_t force_wait;
+};
+
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+ return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) &
+ (IB_FMR_HASH_SIZE - 1);
+}
+
+/* Caller must hold pool_lock */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+ u64 *page_list,
+ int page_list_len,
+ u64 io_virtual_address)
+{
+ struct hlist_head *bucket;
+ struct ib_pool_fmr *fmr;
+
+ if (!pool->cache_bucket)
+ return NULL;
+
+ bucket = pool->cache_bucket + ib_fmr_hash(*page_list);
+
+ hlist_for_each_entry(fmr, bucket, cache_node)
+ if (io_virtual_address == fmr->io_virtual_address &&
+ page_list_len == fmr->page_list_len &&
+ !memcmp(page_list, fmr->page_list,
+ page_list_len * sizeof *page_list))
+ return fmr;
+
+ return NULL;
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+ int ret;
+ struct ib_pool_fmr *fmr;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(fmr_list);
+
+ spin_lock_irq(&pool->pool_lock);
+
+ list_for_each_entry(fmr, &pool->dirty_list, list) {
+ hlist_del_init(&fmr->cache_node);
+ fmr->remap_count = 0;
+ list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+ if (fmr->ref_count !=0) {
+ pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n",
+ fmr, fmr->ref_count);
+ }
+#endif
+ }
+
+ list_splice_init(&pool->dirty_list, &unmap_list);
+ pool->dirty_len = 0;
+
+ spin_unlock_irq(&pool->pool_lock);
+
+ if (list_empty(&unmap_list)) {
+ return;
+ }
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ pr_warn(PFX "ib_unmap_fmr returned %d\n", ret);
+
+ spin_lock_irq(&pool->pool_lock);
+ list_splice(&unmap_list, &pool->free_list);
+ spin_unlock_irq(&pool->pool_lock);
+}
+
+static void ib_fmr_cleanup_func(struct kthread_work *work)
+{
+ struct ib_fmr_pool *pool = container_of(work, struct ib_fmr_pool, work);
+
+ ib_fmr_batch_release(pool);
+ atomic_inc(&pool->flush_ser);
+ wake_up_interruptible(&pool->force_wait);
+
+ if (pool->flush_function)
+ pool->flush_function(pool, pool->flush_arg);
+
+ if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0)
+ kthread_queue_work(pool->worker, &pool->work);
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs. Return value is pointer to new pool or
+ * error code if creation failed.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params)
+{
+ struct ib_device *device;
+ struct ib_fmr_pool *pool;
+ int i;
+ int ret;
+ int max_remaps;
+
+ if (!params)
+ return ERR_PTR(-EINVAL);
+
+ device = pd->device;
+ if (!device->alloc_fmr || !device->dealloc_fmr ||
+ !device->map_phys_fmr || !device->unmap_fmr) {
+ pr_info(PFX "Device %s does not support FMRs\n", device->name);
+ return ERR_PTR(-ENOSYS);
+ }
+
+ if (!device->attrs.max_map_per_fmr)
+ max_remaps = IB_FMR_MAX_REMAPS;
+ else
+ max_remaps = device->attrs.max_map_per_fmr;
+
+ pool = kmalloc(sizeof *pool, GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ pool->cache_bucket = NULL;
+ pool->flush_function = params->flush_function;
+ pool->flush_arg = params->flush_arg;
+
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->dirty_list);
+
+ if (params->cache) {
+ pool->cache_bucket =
+ kmalloc_array(IB_FMR_HASH_SIZE,
+ sizeof(*pool->cache_bucket),
+ GFP_KERNEL);
+ if (!pool->cache_bucket) {
+ ret = -ENOMEM;
+ goto out_free_pool;
+ }
+
+ for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+ INIT_HLIST_HEAD(pool->cache_bucket + i);
+ }
+
+ pool->pool_size = 0;
+ pool->max_pages = params->max_pages_per_fmr;
+ pool->max_remaps = max_remaps;
+ pool->dirty_watermark = params->dirty_watermark;
+ pool->dirty_len = 0;
+ spin_lock_init(&pool->pool_lock);
+ atomic_set(&pool->req_ser, 0);
+ atomic_set(&pool->flush_ser, 0);
+ init_waitqueue_head(&pool->force_wait);
+
+ pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name);
+ if (IS_ERR(pool->worker)) {
+ pr_warn(PFX "couldn't start cleanup kthread worker\n");
+ ret = PTR_ERR(pool->worker);
+ goto out_free_pool;
+ }
+ kthread_init_work(&pool->work, ib_fmr_cleanup_func);
+
+ {
+ struct ib_pool_fmr *fmr;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = params->max_pages_per_fmr,
+ .max_maps = pool->max_remaps,
+ .page_shift = params->page_shift
+ };
+ int bytes_per_fmr = sizeof *fmr;
+
+ if (pool->cache_bucket)
+ bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
+
+ for (i = 0; i < params->pool_size; ++i) {
+ fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
+ if (!fmr)
+ goto out_fail;
+
+ fmr->pool = pool;
+ fmr->remap_count = 0;
+ fmr->ref_count = 0;
+ INIT_HLIST_NODE(&fmr->cache_node);
+
+ fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
+ if (IS_ERR(fmr->fmr)) {
+ pr_warn(PFX "fmr_create failed for FMR %d\n",
+ i);
+ kfree(fmr);
+ goto out_fail;
+ }
+
+ list_add_tail(&fmr->list, &pool->free_list);
+ ++pool->pool_size;
+ }
+ }
+
+ return pool;
+
+ out_free_pool:
+ kfree(pool->cache_bucket);
+ kfree(pool);
+
+ return ERR_PTR(ret);
+
+ out_fail:
+ ib_destroy_fmr_pool(pool);
+
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Destroy an FMR pool and free all associated resources.
+ */
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+ struct ib_pool_fmr *fmr;
+ struct ib_pool_fmr *tmp;
+ LIST_HEAD(fmr_list);
+ int i;
+
+ kthread_destroy_worker(pool->worker);
+ ib_fmr_batch_release(pool);
+
+ i = 0;
+ list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
+ if (fmr->remap_count) {
+ INIT_LIST_HEAD(&fmr_list);
+ list_add_tail(&fmr->fmr->list, &fmr_list);
+ ib_unmap_fmr(&fmr_list);
+ }
+ ib_dealloc_fmr(fmr->fmr);
+ list_del(&fmr->list);
+ kfree(fmr);
+ ++i;
+ }
+
+ if (i < pool->pool_size)
+ pr_warn(PFX "pool still has %d regions registered\n",
+ pool->pool_size - i);
+
+ kfree(pool->cache_bucket);
+ kfree(pool);
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+ int serial;
+ struct ib_pool_fmr *fmr, *next;
+
+ /*
+ * The free_list holds FMRs that may have been used
+ * but have not been remapped enough times to be dirty.
+ * Put them on the dirty list now so that the cleanup
+ * thread will reap them too.
+ */
+ spin_lock_irq(&pool->pool_lock);
+ list_for_each_entry_safe(fmr, next, &pool->free_list, list) {
+ if (fmr->remap_count > 0)
+ list_move(&fmr->list, &pool->dirty_list);
+ }
+ spin_unlock_irq(&pool->pool_lock);
+
+ serial = atomic_inc_return(&pool->req_ser);
+ kthread_queue_work(pool->worker, &pool->work);
+
+ if (wait_event_interruptible(pool->force_wait,
+ atomic_read(&pool->flush_ser) - serial >= 0))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys - Map an FMR from an FMR pool.
+ * @pool_handle: FMR pool to allocate FMR from
+ * @page_list: List of pages to map
+ * @list_len: Number of pages in @page_list
+ * @io_virtual_address: I/O virtual address for new FMR
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 io_virtual_address)
+{
+ struct ib_fmr_pool *pool = pool_handle;
+ struct ib_pool_fmr *fmr;
+ unsigned long flags;
+ int result;
+
+ if (list_len < 1 || list_len > pool->max_pages)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ fmr = ib_fmr_cache_lookup(pool,
+ page_list,
+ list_len,
+ io_virtual_address);
+ if (fmr) {
+ /* found in cache */
+ ++fmr->ref_count;
+ if (fmr->ref_count == 1) {
+ list_del(&fmr->list);
+ }
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return fmr;
+ }
+
+ if (list_empty(&pool->free_list)) {
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+ list_del(&fmr->list);
+ hlist_del_init(&fmr->cache_node);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+ io_virtual_address);
+
+ if (result) {
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ list_add(&fmr->list, &pool->free_list);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ pr_warn(PFX "fmr_map returns %d\n", result);
+
+ return ERR_PTR(result);
+ }
+
+ ++fmr->remap_count;
+ fmr->ref_count = 1;
+
+ if (pool->cache_bucket) {
+ fmr->io_virtual_address = io_virtual_address;
+ fmr->page_list_len = list_len;
+ memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ hlist_add_head(&fmr->cache_node,
+ pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Unmap an FMR. The FMR mapping may remain valid until the FMR is
+ * reused (or until ib_flush_fmr_pool() is called).
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+ struct ib_fmr_pool *pool;
+ unsigned long flags;
+
+ pool = fmr->pool;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+
+ --fmr->ref_count;
+ if (!fmr->ref_count) {
+ if (fmr->remap_count < pool->max_remaps) {
+ list_add_tail(&fmr->list, &pool->free_list);
+ } else {
+ list_add_tail(&fmr->list, &pool->dirty_list);
+ if (++pool->dirty_len >= pool->dirty_watermark) {
+ atomic_inc(&pool->req_ser);
+ kthread_queue_work(pool->worker, &pool->work);
+ }
+ }
+ }
+
+#ifdef DEBUG
+ if (fmr->ref_count < 0)
+ pr_warn(PFX "FMR %p has ref count %d < 0\n",
+ fmr, fmr->ref_count);
+#endif
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
new file mode 100644
index 000000000..57aec656a
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+#include <rdma/iw_portmap.h>
+#include <rdma/rdma_netlink.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static const char * const iwcm_rej_reason_strs[] = {
+ [ECONNRESET] = "reset by remote host",
+ [ECONNREFUSED] = "refused by remote application",
+ [ETIMEDOUT] = "setup timeout",
+};
+
+const char *__attribute_const__ iwcm_reject_msg(int reason)
+{
+ size_t index;
+
+ /* iWARP uses negative errnos */
+ index = -reason;
+
+ if (index < ARRAY_SIZE(iwcm_rej_reason_strs) &&
+ iwcm_rej_reason_strs[index])
+ return iwcm_rej_reason_strs[index];
+ else
+ return "unrecognized reason";
+}
+EXPORT_SYMBOL(iwcm_reject_msg);
+
+static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = {
+ [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
+ [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
+ [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
+ [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
+ [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
+ [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
+ [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+};
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+ struct work_struct work;
+ struct iwcm_id_private *cm_id;
+ struct list_head list;
+ struct iw_cm_event event;
+ struct list_head free_list;
+};
+
+static unsigned int default_backlog = 256;
+
+static struct ctl_table_header *iwcm_ctl_table_hdr;
+static struct ctl_table iwcm_ctl_table[] = {
+ {
+ .procname = "default_backlog",
+ .data = &default_backlog,
+ .maxlen = sizeof(default_backlog),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements. The design pre-allocates them based on the cm_id type:
+ * LISTENING IDS: Get enough elements preallocated to handle the
+ * listen backlog.
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If
+ * the backlog is exceeded, then no more connection request events will
+ * be processed. cm_event_handler() returns -ENOMEM in this case. Its up
+ * to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ * If work elements cannot be allocated for the new connect request cm_id,
+ * then IWCM will call the provider reject method. This is ok since
+ * cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+ struct iwcm_work *work;
+
+ if (list_empty(&cm_id_priv->work_free_list))
+ return NULL;
+ work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
+ free_list);
+ list_del_init(&work->free_list);
+ return work;
+}
+
+static void put_work(struct iwcm_work *work)
+{
+ list_add(&work->free_list, &work->cm_id->work_free_list);
+}
+
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+ struct list_head *e, *tmp;
+
+ list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) {
+ list_del(e);
+ kfree(list_entry(e, struct iwcm_work, free_list));
+ }
+}
+
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+ struct iwcm_work *work;
+
+ BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+ while (count--) {
+ work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL);
+ if (!work) {
+ dealloc_work_entries(cm_id_priv);
+ return -ENOMEM;
+ }
+ work->cm_id = cm_id_priv;
+ INIT_LIST_HEAD(&work->list);
+ put_work(work);
+ }
+ return 0;
+}
+
+/*
+ * Save private data from incoming connection requests to
+ * iw_cm_event, so the low level driver doesn't have to. Adjust
+ * the event ptr to point to the local copy.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+ void *p;
+
+ p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC);
+ if (!p)
+ return -ENOMEM;
+ event->private_data = p;
+ return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+ dealloc_work_entries(cm_id_priv);
+ kfree(cm_id_priv);
+}
+
+/*
+ * Release a reference on cm_id. If the last reference is being
+ * released, free the cm_id and return 1.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+ BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+ if (atomic_dec_and_test(&cm_id_priv->refcount)) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void add_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ atomic_inc(&cm_id_priv->refcount);
+}
+
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ iw_cm_handler cm_handler,
+ void *context)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.event_handler = cm_event_handler;
+ cm_id_priv->id.add_ref = add_ref;
+ cm_id_priv->id.rem_ref = rem_ref;
+ spin_lock_init(&cm_id_priv->lock);
+ atomic_set(&cm_id_priv->refcount, 1);
+ init_waitqueue_head(&cm_id_priv->connect_wait);
+ init_completion(&cm_id_priv->destroy_comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ INIT_LIST_HEAD(&cm_id_priv->work_free_list);
+
+ return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
+
+
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ if (!qp)
+ return -EINVAL;
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * This is really the RDMAC CLOSING state. It is most similar to the
+ * IB SQD QP state.
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ BUG_ON(qp == NULL);
+ qp_attr.qp_state = IB_QPS_SQD;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ * based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ * disconnecting concurrently with us and we've already seen the
+ * DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+ struct ib_qp *qp = NULL;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /* Wait if we're currently in a connect or accept downcall */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+ /* QP could be <nul> for user-mode client */
+ if (cm_id_priv->qp)
+ qp = cm_id_priv->qp;
+ else
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_LISTEN:
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_CLOSING:
+ /* remote peer closed first */
+ case IW_CM_STATE_IDLE:
+ /* accept or connect returned !0 */
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called disconnect before/without calling accept after
+ * connect_request event delivered.
+ */
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ /* Can only get here if wait above fails */
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (qp) {
+ if (abrupt)
+ ret = iwcm_modify_qp_err(qp);
+ else
+ ret = iwcm_modify_qp_sqd(qp);
+
+ /*
+ * If both sides are disconnecting the QP could
+ * already be in ERR or SQD states
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /*
+ * Wait if we're currently in a connect or accept downcall. A
+ * listening endpoint should never block here.
+ */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ /*
+ * Since we're deleting the cm_id, drop any events that
+ * might arrive before the last dereference.
+ */
+ set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_LISTEN:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* destroy the listening endpoint */
+ cm_id->device->iwcm->destroy_listen(cm_id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* Abrupt close of the connection */
+ (void)iwcm_modify_qp_err(cm_id_priv->qp);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called destroy before/without calling accept after
+ * receiving connection request event notification or
+ * returned non zero from the event callback function.
+ * In either case, must tell the provider to reject.
+ */
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_id->device->iwcm->reject(cm_id, NULL, 0);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_DESTROYING:
+ default:
+ BUG();
+ break;
+ }
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (cm_id->mapped) {
+ iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
+ iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
+ }
+
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * This function is only called by the application thread and cannot
+ * be called by the event thread. The function will wait for all
+ * references to be released on the cm_id and then kfree the cm_id
+ * object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ destroy_cm_id(cm_id);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/**
+ * iw_cm_check_wildcard - If IP address is 0 then use original
+ * @pm_addr: sockaddr containing the ip to check for wildcard
+ * @cm_addr: sockaddr containing the actual IP address
+ * @cm_outaddr: sockaddr to set IP addr which leaving port
+ *
+ * Checks the pm_addr for wildcard and then sets cm_outaddr's
+ * IP to the actual (cm_addr).
+ */
+static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
+ struct sockaddr_storage *cm_addr,
+ struct sockaddr_storage *cm_outaddr)
+{
+ if (pm_addr->ss_family == AF_INET) {
+ struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr;
+
+ if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ struct sockaddr_in *cm4_addr =
+ (struct sockaddr_in *)cm_addr;
+ struct sockaddr_in *cm4_outaddr =
+ (struct sockaddr_in *)cm_outaddr;
+
+ cm4_outaddr->sin_addr = cm4_addr->sin_addr;
+ }
+ } else {
+ struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr;
+
+ if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) {
+ struct sockaddr_in6 *cm6_addr =
+ (struct sockaddr_in6 *)cm_addr;
+ struct sockaddr_in6 *cm6_outaddr =
+ (struct sockaddr_in6 *)cm_outaddr;
+
+ cm6_outaddr->sin6_addr = cm6_addr->sin6_addr;
+ }
+ }
+}
+
+/**
+ * iw_cm_map - Use portmapper to map the ports
+ * @cm_id: connection manager pointer
+ * @active: Indicates the active side when true
+ * returns nonzero for error only if iwpm_create_mapinfo() fails
+ *
+ * Tries to add a mapping for a port using the Portmapper. If
+ * successful in mapping the IP/Port it will check the remote
+ * mapped IP address for a wildcard IP address and replace the
+ * zero IP address with the remote_addr.
+ */
+static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
+{
+ struct iwpm_dev_data pm_reg_msg;
+ struct iwpm_sa_data pm_msg;
+ int status;
+
+ cm_id->m_local_addr = cm_id->local_addr;
+ cm_id->m_remote_addr = cm_id->remote_addr;
+
+ memcpy(pm_reg_msg.dev_name, cm_id->device->name,
+ sizeof(pm_reg_msg.dev_name));
+ memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname,
+ sizeof(pm_reg_msg.if_name));
+
+ if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) ||
+ !iwpm_valid_pid())
+ return 0;
+
+ cm_id->mapped = true;
+ pm_msg.loc_addr = cm_id->local_addr;
+ pm_msg.rem_addr = cm_id->remote_addr;
+ if (active)
+ status = iwpm_add_and_query_mapping(&pm_msg,
+ RDMA_NL_IWCM);
+ else
+ status = iwpm_add_mapping(&pm_msg, RDMA_NL_IWCM);
+
+ if (!status) {
+ cm_id->m_local_addr = pm_msg.mapped_loc_addr;
+ if (active) {
+ cm_id->m_remote_addr = pm_msg.mapped_rem_addr;
+ iw_cm_check_wildcard(&pm_msg.mapped_rem_addr,
+ &cm_id->remote_addr,
+ &cm_id->m_remote_addr);
+ }
+ }
+
+ return iwpm_create_mapinfo(&cm_id->local_addr,
+ &cm_id->m_local_addr,
+ RDMA_NL_IWCM);
+}
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ if (!backlog)
+ backlog = default_backlog;
+
+ ret = alloc_work_entries(cm_id_priv, backlog);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ cm_id_priv->state = IW_CM_STATE_LISTEN;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = iw_cm_map(cm_id, false);
+ if (!ret)
+ ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+ if (ret)
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->reject(cm_id, private_data,
+ private_data_len);
+
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ struct ib_qp *qp;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+ if (ret) {
+ /* An error on accept precludes provider events */
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_cm_destroy will block until the
+ * CONNECT_REPLY event is received from the provider.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+ unsigned long flags;
+ struct ib_qp *qp;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ ret = alloc_work_entries(cm_id_priv, 4);
+ if (ret)
+ return ret;
+
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ ret = -EINVAL;
+ goto err;
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = iw_cm_map(cm_id, true);
+ if (!ret)
+ ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+ if (!ret)
+ return 0; /* success */
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+err:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied when the device is cloned. The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ struct iw_cm_id *cm_id;
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ /*
+ * The provider should never generate a connection request
+ * event with a bad status.
+ */
+ BUG_ON(iw_event->status);
+
+ cm_id = iw_create_cm_id(listen_id_priv->id.device,
+ listen_id_priv->id.cm_handler,
+ listen_id_priv->id.context);
+ /* If the cm_id could not be created, ignore the request */
+ if (IS_ERR(cm_id))
+ goto out;
+
+ cm_id->provider_data = iw_event->provider_data;
+ cm_id->m_local_addr = iw_event->local_addr;
+ cm_id->m_remote_addr = iw_event->remote_addr;
+ cm_id->local_addr = listen_id_priv->id.local_addr;
+
+ ret = iwpm_get_remote_info(&listen_id_priv->id.m_local_addr,
+ &iw_event->remote_addr,
+ &cm_id->remote_addr,
+ RDMA_NL_IWCM);
+ if (ret) {
+ cm_id->remote_addr = iw_event->remote_addr;
+ } else {
+ iw_cm_check_wildcard(&listen_id_priv->id.m_local_addr,
+ &iw_event->local_addr,
+ &cm_id->local_addr);
+ iw_event->local_addr = cm_id->local_addr;
+ iw_event->remote_addr = cm_id->remote_addr;
+ }
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ spin_lock_irqsave(&listen_id_priv->lock, flags);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+ ret = alloc_work_entries(cm_id_priv, 3);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+
+ /* Call the client CM handler */
+ ret = cm_id->cm_handler(cm_id, iw_event);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ }
+
+out:
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotion has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ /*
+ * We clear the CONNECT_WAIT bit here to allow the callback
+ * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+ * from a callback handler is not allowed.
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post it's requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ /*
+ * Clear the connect wait bit so a callback function calling
+ * iw_cm_disconnect will not wait and deadlock this thread
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ if (iw_event->status == 0) {
+ cm_id_priv->id.m_local_addr = iw_event->local_addr;
+ cm_id_priv->id.m_remote_addr = iw_event->remote_addr;
+ iw_event->local_addr = cm_id_priv->id.local_addr;
+ iw_event->remote_addr = cm_id_priv->id.remote_addr;
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ } else {
+ /* REJECTED or RESET */
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+
+ /* Wake up waiters on connect complete */
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTBLISHED or CLOSING states, the QP will have have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret = 0;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_DESTROYING:
+ break;
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+static int process_event(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret = 0;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CONNECT_REQUEST:
+ cm_conn_req_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ ret = cm_conn_est_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_DISCONNECT:
+ cm_disconnect_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CLOSE:
+ ret = cm_close_handler(cm_id_priv, iw_event);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+ struct iw_cm_event levent;
+ struct iwcm_id_private *cm_id_priv = work->cm_id;
+ unsigned long flags;
+ int empty;
+ int ret = 0;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ empty = list_empty(&cm_id_priv->work_list);
+ while (!empty) {
+ work = list_entry(cm_id_priv->work_list.next,
+ struct iwcm_work, list);
+ list_del_init(&work->list);
+ empty = list_empty(&cm_id_priv->work_list);
+ levent = work->event;
+ put_work(work);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+ ret = process_event(cm_id_priv, &levent);
+ if (ret)
+ destroy_cm_id(&cm_id_priv->id);
+ } else
+ pr_debug("dropping event %d\n", levent.event);
+ if (iwcm_deref_id(cm_id_priv))
+ return;
+ if (empty)
+ return;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * This function is called on interrupt context. Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block. Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * Returns:
+ * 0 - the event was handled.
+ * -ENOMEM - the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct iwcm_work *work;
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ work = get_work(cm_id_priv);
+ if (!work) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_WORK(&work->work, cm_work_handler);
+ work->cm_id = cm_id_priv;
+ work->event = *iw_event;
+
+ if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+ work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+ work->event.private_data_len) {
+ ret = copy_private_data(&work->event);
+ if (ret) {
+ put_work(work);
+ goto out;
+ }
+ }
+
+ atomic_inc(&cm_id_priv->refcount);
+ if (list_empty(&cm_id_priv->work_list)) {
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ queue_work(iwcm_wq, &work->work);
+ } else
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+out:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
+ IB_ACCESS_REMOTE_READ;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = 0;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ case IB_QPS_RTR:
+ ret = iwcm_init_qp_init_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = iwcm_init_qp_rts_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
+
+static int __init iw_cm_init(void)
+{
+ int ret;
+
+ ret = iwpm_init(RDMA_NL_IWCM);
+ if (ret)
+ return ret;
+
+ iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0);
+ if (!iwcm_wq)
+ goto err_alloc;
+
+ iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
+ iwcm_ctl_table);
+ if (!iwcm_ctl_table_hdr) {
+ pr_err("iw_cm: couldn't register sysctl paths\n");
+ goto err_sysctl;
+ }
+
+ rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table);
+ return 0;
+
+err_sysctl:
+ destroy_workqueue(iwcm_wq);
+err_alloc:
+ iwpm_exit(RDMA_NL_IWCM);
+ return -ENOMEM;
+}
+
+static void __exit iw_cm_cleanup(void)
+{
+ rdma_nl_unregister(RDMA_NL_IWCM);
+ unregister_net_sysctl_table(iwcm_ctl_table_hdr);
+ destroy_workqueue(iwcm_wq);
+ iwpm_exit(RDMA_NL_IWCM);
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2);
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
new file mode 100644
index 000000000..82c2cd1b0
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+enum iw_cm_state {
+ IW_CM_STATE_IDLE, /* unbound, inactive */
+ IW_CM_STATE_LISTEN, /* listen waiting for connect */
+ IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */
+ IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */
+ IW_CM_STATE_ESTABLISHED, /* established */
+ IW_CM_STATE_CLOSING, /* disconnect */
+ IW_CM_STATE_DESTROYING /* object being deleted */
+};
+
+struct iwcm_id_private {
+ struct iw_cm_id id;
+ enum iw_cm_state state;
+ unsigned long flags;
+ struct ib_qp *qp;
+ struct completion destroy_comp;
+ wait_queue_head_t connect_wait;
+ struct list_head work_list;
+ spinlock_t lock;
+ atomic_t refcount;
+ struct list_head work_free_list;
+};
+
+#define IWCM_F_DROP_EVENTS 1
+#define IWCM_F_CONNECT_WAIT 2
+
+#endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
new file mode 100644
index 000000000..8861c0521
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
+static int iwpm_ulib_version = 3;
+static int iwpm_user_pid = IWPM_PID_UNDEFINED;
+static atomic_t echo_nlmsg_seq;
+
+int iwpm_valid_pid(void)
+{
+ return iwpm_user_pid > 0;
+}
+
+/*
+ * iwpm_register_pid - Send a netlink query to user space
+ * for the iwarp port mapper pid
+ *
+ * nlmsg attributes:
+ * [IWPM_NLA_REG_PID_SEQ]
+ * [IWPM_NLA_REG_IF_NAME]
+ * [IWPM_NLA_REG_IBDEV_NAME]
+ * [IWPM_NLA_REG_ULIB_NAME]
+ */
+int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto pid_query_error;
+ }
+ if (iwpm_check_registration(nl_client, IWPM_REG_VALID) ||
+ iwpm_user_pid == IWPM_PID_UNAVAILABLE)
+ return 0;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto pid_query_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto pid_query_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+
+ /* fill in the pid request message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IFNAMSIZ,
+ pm_msg->if_name, IWPM_NLA_REG_IF_NAME);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE,
+ pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE,
+ (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME);
+ if (ret)
+ goto pid_query_error;
+
+ nlmsg_end(skb, nlh);
+
+ pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
+ __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
+
+ ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNAVAILABLE;
+ err_str = "Unable to send a nlmsg";
+ goto pid_query_error;
+ }
+ nlmsg_request->req_buffer = pm_msg;
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+pid_query_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+
+/*
+ * iwpm_add_mapping - Send a netlink add mapping message
+ * to the port mapper
+ * nlmsg attributes:
+ * [IWPM_NLA_MANAGE_MAPPING_SEQ]
+ * [IWPM_NLA_MANAGE_ADDR]
+ */
+int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto add_mapping_error;
+ }
+ if (!iwpm_valid_pid())
+ return 0;
+ if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
+ err_str = "Unregistered port mapper client";
+ goto add_mapping_error;
+ }
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto add_mapping_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto add_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+ /* fill in the add mapping message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_MANAGE_MAPPING_SEQ);
+ if (ret)
+ goto add_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR);
+ if (ret)
+ goto add_mapping_error;
+
+ nlmsg_end(skb, nlh);
+ nlmsg_request->req_buffer = pm_msg;
+
+ ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNDEFINED;
+ err_str = "Unable to send a nlmsg";
+ goto add_mapping_error;
+ }
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+add_mapping_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+
+/*
+ * iwpm_add_and_query_mapping - Send a netlink add and query
+ * mapping message to the port mapper
+ * nlmsg attributes:
+ * [IWPM_NLA_QUERY_MAPPING_SEQ]
+ * [IWPM_NLA_QUERY_LOCAL_ADDR]
+ * [IWPM_NLA_QUERY_REMOTE_ADDR]
+ */
+int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto query_mapping_error;
+ }
+ if (!iwpm_valid_pid())
+ return 0;
+ if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
+ err_str = "Unregistered port mapper client";
+ goto query_mapping_error;
+ }
+ ret = -ENOMEM;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto query_mapping_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq,
+ nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto query_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+
+ /* fill in the query message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_QUERY_MAPPING_SEQ);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR);
+ if (ret)
+ goto query_mapping_error;
+
+ nlmsg_end(skb, nlh);
+ nlmsg_request->req_buffer = pm_msg;
+
+ ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ err_str = "Unable to send a nlmsg";
+ goto query_mapping_error;
+ }
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+query_mapping_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+
+/*
+ * iwpm_remove_mapping - Send a netlink remove mapping message
+ * to the port mapper
+ * nlmsg attributes:
+ * [IWPM_NLA_MANAGE_MAPPING_SEQ]
+ * [IWPM_NLA_MANAGE_ADDR]
+ */
+int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto remove_mapping_error;
+ }
+ if (!iwpm_valid_pid())
+ return 0;
+ if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) {
+ err_str = "Unregistered port mapper client";
+ goto remove_mapping_error;
+ }
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to create a nlmsg";
+ goto remove_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_MANAGE_MAPPING_SEQ);
+ if (ret)
+ goto remove_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ local_addr, IWPM_NLA_MANAGE_ADDR);
+ if (ret)
+ goto remove_mapping_error;
+
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNDEFINED;
+ err_str = "Unable to send a nlmsg";
+ goto remove_mapping_error;
+ }
+ iwpm_print_sockaddr(local_addr,
+ "remove_mapping: Local sockaddr:");
+ return 0;
+remove_mapping_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb_any(skb);
+ return ret;
+}
+
+/* netlink attribute policy for the received response to register pid request */
+static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
+ [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING,
+ .len = IWPM_DEVNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 },
+ [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_register_pid_cb - Process a port mapper response to
+ * iwpm_register_pid()
+ */
+int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX];
+ struct iwpm_dev_data *pm_msg;
+ char *dev_name, *iwpm_name;
+ u32 msg_seq;
+ u8 nl_client;
+ u16 iwpm_version;
+ const char *msg_type = "Register Pid response";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX,
+ resp_reg_policy, nltb, msg_type))
+ return -EINVAL;
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ nl_client = nlmsg_request->nl_client;
+ dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]);
+ iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]);
+ iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]);
+
+ /* check device name, ulib name and version */
+ if (strcmp(pm_msg->dev_name, dev_name) ||
+ strcmp(iwpm_ulib_name, iwpm_name) ||
+ iwpm_version != iwpm_ulib_version) {
+
+ pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
+ __func__, dev_name, iwpm_name, iwpm_version);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto register_pid_response_exit;
+ }
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+ __func__, iwpm_user_pid);
+ if (iwpm_valid_client(nl_client))
+ iwpm_set_registration(nl_client, IWPM_REG_VALID);
+register_pid_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found nlmsg_request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
+
+/* netlink attribute policy for the received response to add mapping request */
+static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
+ [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_mapping_cb - Process a port mapper response to
+ * iwpm_add_mapping()
+ */
+int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr;
+ struct sockaddr_storage *mapped_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+
+ msg_type = "Add Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX,
+ resp_add_policy, nltb, msg_type))
+ return -EINVAL;
+
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+ mapped_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr,
+ sizeof(*mapped_sockaddr));
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "add_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "add_mapping: Mapped local sockaddr:");
+
+add_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
+
+/* netlink attribute policy for the response to add and query mapping request
+ * and response with remote address info */
+static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
+ [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_and_query_mapping_cb - Process a port mapper response to
+ * iwpm_add_and_query_mapping()
+ */
+int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+ struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+ u16 err_code;
+
+ msg_type = "Query Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+ resp_query_policy, nltb, msg_type))
+ return -EINVAL;
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ remote_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ mapped_loc_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+ mapped_rem_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+ err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]);
+ if (err_code == IWPM_REMOTE_QUERY_REJECT) {
+ pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n",
+ __func__, cb->nlh->nlmsg_pid, msg_seq);
+ nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT;
+ }
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) ||
+ iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) {
+ pr_info("%s: Incorrect local sockaddr\n", __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+ mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr,
+ sizeof(*mapped_loc_sockaddr));
+ memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr,
+ sizeof(*mapped_rem_sockaddr));
+
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "query_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "query_mapping: Mapped local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->rem_addr,
+ "query_mapping: Remote sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_rem_addr,
+ "query_mapping: Mapped remote sockaddr:");
+query_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
+
+/*
+ * iwpm_remote_info_cb - Process a port mapper message, containing
+ * the remote connecting peer address info
+ */
+int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+ struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+ struct iwpm_remote_info *rem_info;
+ const char *msg_type;
+ u8 nl_client;
+ int ret = -EINVAL;
+
+ msg_type = "Remote Mapping info";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+ resp_query_policy, nltb, msg_type))
+ return ret;
+
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid port mapper client = %d\n",
+ __func__, nl_client);
+ return ret;
+ }
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ remote_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ mapped_loc_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+ mapped_rem_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+ if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+ mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ return ret;
+ }
+ rem_info = kzalloc(sizeof(struct iwpm_remote_info), GFP_ATOMIC);
+ if (!rem_info) {
+ ret = -ENOMEM;
+ return ret;
+ }
+ memcpy(&rem_info->mapped_loc_sockaddr, mapped_loc_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&rem_info->remote_sockaddr, remote_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&rem_info->mapped_rem_sockaddr, mapped_rem_sockaddr,
+ sizeof(struct sockaddr_storage));
+ rem_info->nl_client = nl_client;
+
+ iwpm_add_remote_info(rem_info);
+
+ iwpm_print_sockaddr(local_sockaddr,
+ "remote_info: Local sockaddr:");
+ iwpm_print_sockaddr(mapped_loc_sockaddr,
+ "remote_info: Mapped local sockaddr:");
+ iwpm_print_sockaddr(remote_sockaddr,
+ "remote_info: Remote sockaddr:");
+ iwpm_print_sockaddr(mapped_rem_sockaddr,
+ "remote_info: Mapped remote sockaddr:");
+ return ret;
+}
+
+/* netlink attribute policy for the received request for mapping info */
+static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
+ [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+ */
+int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX];
+ const char *msg_type = "Mapping Info response";
+ u8 nl_client;
+ char *iwpm_name;
+ u16 iwpm_version;
+ int ret = -EINVAL;
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX,
+ resp_mapinfo_policy, nltb, msg_type)) {
+ pr_info("%s: Unable to parse nlmsg\n", __func__);
+ return ret;
+ }
+ iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
+ iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
+ if (strcmp(iwpm_ulib_name, iwpm_name) ||
+ iwpm_version != iwpm_ulib_version) {
+ pr_info("%s: Invalid port mapper name = %s version = %d\n",
+ __func__, iwpm_name, iwpm_version);
+ return ret;
+ }
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid port mapper client = %d\n",
+ __func__, nl_client);
+ return ret;
+ }
+ iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+ if (!iwpm_mapinfo_available())
+ return 0;
+ pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+ __func__, iwpm_user_pid);
+ ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid);
+ return ret;
+}
+
+/* netlink attribute policy for the received mapping info ack */
+static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
+ [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 }
+};
+
+/*
+ * iwpm_ack_mapping_info_cb - Process a port mapper ack for
+ * the provided mapping info records
+ */
+int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX];
+ u32 mapinfo_send, mapinfo_ack;
+ const char *msg_type = "Mapping Info Ack";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX,
+ ack_mapinfo_policy, nltb, msg_type))
+ return -EINVAL;
+ mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]);
+ mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]);
+ if (mapinfo_ack != mapinfo_send)
+ pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n",
+ __func__, mapinfo_send, mapinfo_ack);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ return 0;
+}
+
+/* netlink attribute policy for the received port mapper error message */
+static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
+ [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 },
+};
+
+/*
+ * iwpm_mapping_error_cb - Process a port mapper error message
+ */
+int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ struct nlattr *nltb[IWPM_NLA_ERR_MAX];
+ u32 msg_seq;
+ u16 err_code;
+ const char *msg_type = "Mapping Error Msg";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX,
+ map_error_policy, nltb, msg_type))
+ return -EINVAL;
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]);
+ err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]);
+ pr_info("%s: Received msg seq = %u err code = %u client = %d\n",
+ __func__, msg_seq, err_code, nl_client);
+ /* look for nlmsg_request */
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ /* not all errors have associated requests */
+ pr_debug("Could not find matching req (seq = %u)\n", msg_seq);
+ return 0;
+ }
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ nlmsg_request->err_code = err_code;
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
new file mode 100644
index 000000000..cdb63f3f4
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+#define IWPM_MAPINFO_HASH_SIZE 512
+#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
+#define IWPM_REMINFO_HASH_SIZE 64
+#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+#define IWPM_MSG_SIZE 512
+
+static LIST_HEAD(iwpm_nlmsg_req_list);
+static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
+
+static struct hlist_head *iwpm_hash_bucket;
+static DEFINE_SPINLOCK(iwpm_mapinfo_lock);
+
+static struct hlist_head *iwpm_reminfo_bucket;
+static DEFINE_SPINLOCK(iwpm_reminfo_lock);
+
+static DEFINE_MUTEX(iwpm_admin_lock);
+static struct iwpm_admin_data iwpm_admin;
+
+int iwpm_init(u8 nl_client)
+{
+ int ret = 0;
+ mutex_lock(&iwpm_admin_lock);
+ if (atomic_read(&iwpm_admin.refcount) == 0) {
+ iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!iwpm_hash_bucket) {
+ ret = -ENOMEM;
+ goto init_exit;
+ }
+ iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!iwpm_reminfo_bucket) {
+ kfree(iwpm_hash_bucket);
+ ret = -ENOMEM;
+ goto init_exit;
+ }
+ }
+ atomic_inc(&iwpm_admin.refcount);
+init_exit:
+ mutex_unlock(&iwpm_admin_lock);
+ if (!ret) {
+ iwpm_set_valid(nl_client, 1);
+ iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
+ pr_debug("%s: Mapinfo and reminfo tables are created\n",
+ __func__);
+ }
+ return ret;
+}
+
+static void free_hash_bucket(void);
+static void free_reminfo_bucket(void);
+
+int iwpm_exit(u8 nl_client)
+{
+
+ if (!iwpm_valid_client(nl_client))
+ return -EINVAL;
+ mutex_lock(&iwpm_admin_lock);
+ if (atomic_read(&iwpm_admin.refcount) == 0) {
+ mutex_unlock(&iwpm_admin_lock);
+ pr_err("%s Incorrect usage - negative refcount\n", __func__);
+ return -EINVAL;
+ }
+ if (atomic_dec_and_test(&iwpm_admin.refcount)) {
+ free_hash_bucket();
+ free_reminfo_bucket();
+ pr_debug("%s: Resources are destroyed\n", __func__);
+ }
+ mutex_unlock(&iwpm_admin_lock);
+ iwpm_set_valid(nl_client, 0);
+ iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
+ return 0;
+}
+
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
+int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
+ struct sockaddr_storage *mapped_sockaddr,
+ u8 nl_client)
+{
+ struct hlist_head *hash_bucket_head = NULL;
+ struct iwpm_mapping_info *map_info;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client))
+ return ret;
+ map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL);
+ if (!map_info)
+ return -ENOMEM;
+
+ memcpy(&map_info->local_sockaddr, local_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
+ sizeof(struct sockaddr_storage));
+ map_info->nl_client = nl_client;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ hash_bucket_head = get_mapinfo_hash_bucket(
+ &map_info->local_sockaddr,
+ &map_info->mapped_sockaddr);
+ if (hash_bucket_head) {
+ hlist_add_head(&map_info->hlist_node, hash_bucket_head);
+ ret = 0;
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+
+ if (!hash_bucket_head)
+ kfree(map_info);
+ return ret;
+}
+
+int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
+ struct sockaddr_storage *mapped_local_addr)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct hlist_head *hash_bucket_head;
+ struct iwpm_mapping_info *map_info = NULL;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ hash_bucket_head = get_mapinfo_hash_bucket(
+ local_sockaddr,
+ mapped_local_addr);
+ if (!hash_bucket_head)
+ goto remove_mapinfo_exit;
+
+ hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+ hash_bucket_head, hlist_node) {
+
+ if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr,
+ mapped_local_addr)) {
+
+ hlist_del_init(&map_info->hlist_node);
+ kfree(map_info);
+ ret = 0;
+ break;
+ }
+ }
+ }
+remove_mapinfo_exit:
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+ return ret;
+}
+
+static void free_hash_bucket(void)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct iwpm_mapping_info *map_info;
+ unsigned long flags;
+ int i;
+
+ /* remove all the mapinfo data from the list */
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+ &iwpm_hash_bucket[i], hlist_node) {
+
+ hlist_del_init(&map_info->hlist_node);
+ kfree(map_info);
+ }
+ }
+ /* free the hash list */
+ kfree(iwpm_hash_bucket);
+ iwpm_hash_bucket = NULL;
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+}
+
+static void free_reminfo_bucket(void)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct iwpm_remote_info *rem_info;
+ unsigned long flags;
+ int i;
+
+ /* remove all the remote info from the list */
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(rem_info, tmp_hlist_node,
+ &iwpm_reminfo_bucket[i], hlist_node) {
+
+ hlist_del_init(&rem_info->hlist_node);
+ kfree(rem_info);
+ }
+ }
+ /* free the hash list */
+ kfree(iwpm_reminfo_bucket);
+ iwpm_reminfo_bucket = NULL;
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
+void iwpm_add_remote_info(struct iwpm_remote_info *rem_info)
+{
+ struct hlist_head *hash_bucket_head;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ if (iwpm_reminfo_bucket) {
+ hash_bucket_head = get_reminfo_hash_bucket(
+ &rem_info->mapped_loc_sockaddr,
+ &rem_info->mapped_rem_sockaddr);
+ if (hash_bucket_head)
+ hlist_add_head(&rem_info->hlist_node, hash_bucket_head);
+ }
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
+int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
+ struct sockaddr_storage *mapped_rem_addr,
+ struct sockaddr_storage *remote_addr,
+ u8 nl_client)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct hlist_head *hash_bucket_head;
+ struct iwpm_remote_info *rem_info = NULL;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid client = %d\n", __func__, nl_client);
+ return ret;
+ }
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ if (iwpm_reminfo_bucket) {
+ hash_bucket_head = get_reminfo_hash_bucket(
+ mapped_loc_addr,
+ mapped_rem_addr);
+ if (!hash_bucket_head)
+ goto get_remote_info_exit;
+ hlist_for_each_entry_safe(rem_info, tmp_hlist_node,
+ hash_bucket_head, hlist_node) {
+
+ if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr,
+ mapped_loc_addr) &&
+ !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr,
+ mapped_rem_addr)) {
+
+ memcpy(remote_addr, &rem_info->remote_sockaddr,
+ sizeof(struct sockaddr_storage));
+ iwpm_print_sockaddr(remote_addr,
+ "get_remote_info: Remote sockaddr:");
+
+ hlist_del_init(&rem_info->hlist_node);
+ kfree(rem_info);
+ ret = 0;
+ break;
+ }
+ }
+ }
+get_remote_info_exit:
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+ return ret;
+}
+
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+ u8 nl_client, gfp_t gfp)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ unsigned long flags;
+
+ nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp);
+ if (!nlmsg_request)
+ return NULL;
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list);
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+ kref_init(&nlmsg_request->kref);
+ kref_get(&nlmsg_request->kref);
+ nlmsg_request->nlmsg_seq = nlmsg_seq;
+ nlmsg_request->nl_client = nl_client;
+ nlmsg_request->request_done = 0;
+ nlmsg_request->err_code = 0;
+ sema_init(&nlmsg_request->sem, 1);
+ down(&nlmsg_request->sem);
+ return nlmsg_request;
+}
+
+void iwpm_free_nlmsg_request(struct kref *kref)
+{
+ struct iwpm_nlmsg_request *nlmsg_request;
+ unsigned long flags;
+
+ nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref);
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_del_init(&nlmsg_request->inprocess_list);
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+ if (!nlmsg_request->request_done)
+ pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n",
+ __func__, nlmsg_request->nlmsg_seq);
+ kfree(nlmsg_request);
+}
+
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq)
+{
+ struct iwpm_nlmsg_request *nlmsg_request;
+ struct iwpm_nlmsg_request *found_request = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list,
+ inprocess_list) {
+ if (nlmsg_request->nlmsg_seq == echo_seq) {
+ found_request = nlmsg_request;
+ kref_get(&nlmsg_request->kref);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+ return found_request;
+}
+
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request)
+{
+ int ret;
+
+ ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT);
+ if (ret) {
+ ret = -EINVAL;
+ pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n",
+ __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq);
+ } else {
+ ret = nlmsg_request->err_code;
+ }
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ return ret;
+}
+
+int iwpm_get_nlmsg_seq(void)
+{
+ return atomic_inc_return(&iwpm_admin.nlmsg_seq);
+}
+
+int iwpm_valid_client(u8 nl_client)
+{
+ return iwpm_admin.client_list[nl_client];
+}
+
+void iwpm_set_valid(u8 nl_client, int valid)
+{
+ iwpm_admin.client_list[nl_client] = valid;
+}
+
+/* valid client */
+u32 iwpm_get_registration(u8 nl_client)
+{
+ return iwpm_admin.reg_list[nl_client];
+}
+
+/* valid client */
+void iwpm_set_registration(u8 nl_client, u32 reg)
+{
+ iwpm_admin.reg_list[nl_client] = reg;
+}
+
+/* valid client */
+u32 iwpm_check_registration(u8 nl_client, u32 reg)
+{
+ return (iwpm_get_registration(nl_client) & reg);
+}
+
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr)
+{
+ if (a_sockaddr->ss_family != b_sockaddr->ss_family)
+ return 1;
+ if (a_sockaddr->ss_family == AF_INET) {
+ struct sockaddr_in *a4_sockaddr =
+ (struct sockaddr_in *)a_sockaddr;
+ struct sockaddr_in *b4_sockaddr =
+ (struct sockaddr_in *)b_sockaddr;
+ if (!memcmp(&a4_sockaddr->sin_addr,
+ &b4_sockaddr->sin_addr, sizeof(struct in_addr))
+ && a4_sockaddr->sin_port == b4_sockaddr->sin_port)
+ return 0;
+
+ } else if (a_sockaddr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *a6_sockaddr =
+ (struct sockaddr_in6 *)a_sockaddr;
+ struct sockaddr_in6 *b6_sockaddr =
+ (struct sockaddr_in6 *)b_sockaddr;
+ if (!memcmp(&a6_sockaddr->sin6_addr,
+ &b6_sockaddr->sin6_addr, sizeof(struct in6_addr))
+ && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port)
+ return 0;
+
+ } else {
+ pr_err("%s: Invalid sockaddr family\n", __func__);
+ }
+ return 1;
+}
+
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+ int nl_client)
+{
+ struct sk_buff *skb = NULL;
+
+ skb = dev_alloc_skb(IWPM_MSG_SIZE);
+ if (!skb)
+ goto create_nlmsg_exit;
+
+ if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op,
+ NLM_F_REQUEST))) {
+ pr_warn("%s: Unable to put the nlmsg header\n", __func__);
+ dev_kfree_skb(skb);
+ skb = NULL;
+ }
+create_nlmsg_exit:
+ return skb;
+}
+
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+ const struct nla_policy *nlmsg_policy,
+ struct nlattr *nltb[], const char *msg_type)
+{
+ int nlh_len = 0;
+ int ret;
+ const char *err_str = "";
+
+ ret = nlmsg_validate(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy,
+ NULL);
+ if (ret) {
+ err_str = "Invalid attribute";
+ goto parse_nlmsg_error;
+ }
+ ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max - 1,
+ nlmsg_policy, NULL);
+ if (ret) {
+ err_str = "Unable to parse the nlmsg";
+ goto parse_nlmsg_error;
+ }
+ ret = iwpm_validate_nlmsg_attr(nltb, policy_max);
+ if (ret) {
+ err_str = "Invalid NULL attribute";
+ goto parse_nlmsg_error;
+ }
+ return 0;
+parse_nlmsg_error:
+ pr_warn("%s: %s (msg type %s ret = %d)\n",
+ __func__, err_str, msg_type, ret);
+ return ret;
+}
+
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg)
+{
+ struct sockaddr_in6 *sockaddr_v6;
+ struct sockaddr_in *sockaddr_v4;
+
+ switch (sockaddr->ss_family) {
+ case AF_INET:
+ sockaddr_v4 = (struct sockaddr_in *)sockaddr;
+ pr_debug("%s IPV4 %pI4: %u(0x%04X)\n",
+ msg, &sockaddr_v4->sin_addr,
+ ntohs(sockaddr_v4->sin_port),
+ ntohs(sockaddr_v4->sin_port));
+ break;
+ case AF_INET6:
+ sockaddr_v6 = (struct sockaddr_in6 *)sockaddr;
+ pr_debug("%s IPV6 %pI6: %u(0x%04X)\n",
+ msg, &sockaddr_v6->sin6_addr,
+ ntohs(sockaddr_v6->sin6_port),
+ ntohs(sockaddr_v6->sin6_port));
+ break;
+ default:
+ break;
+ }
+}
+
+static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr)
+{
+ u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0);
+ u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0);
+ return hash;
+}
+
+static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr)
+{
+ u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0);
+ u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0);
+ return hash;
+}
+
+static int get_hash_bucket(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr, u32 *hash)
+{
+ u32 a_hash, b_hash;
+
+ if (a_sockaddr->ss_family == AF_INET) {
+ a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr);
+ b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr);
+
+ } else if (a_sockaddr->ss_family == AF_INET6) {
+ a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr);
+ b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr);
+ } else {
+ pr_err("%s: Invalid sockaddr family\n", __func__);
+ return -EINVAL;
+ }
+
+ if (a_hash == b_hash) /* if port mapper isn't available */
+ *hash = a_hash;
+ else
+ *hash = jhash_2words(a_hash, b_hash, 0);
+ return 0;
+}
+
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage
+ *local_sockaddr, struct sockaddr_storage
+ *mapped_sockaddr)
+{
+ u32 hash;
+ int ret;
+
+ ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash);
+ if (ret)
+ return NULL;
+ return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK];
+}
+
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage
+ *mapped_loc_sockaddr, struct sockaddr_storage
+ *mapped_rem_sockaddr)
+{
+ u32 hash;
+ int ret;
+
+ ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash);
+ if (ret)
+ return NULL;
+ return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK];
+}
+
+static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto mapinfo_num_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ msg_seq = 0;
+ err_str = "Unable to put attribute of mapinfo number nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ);
+ if (ret)
+ goto mapinfo_num_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+ &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
+ if (ret)
+ goto mapinfo_num_error;
+
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_unicast(skb, iwpm_pid);
+ if (ret) {
+ skb = NULL;
+ err_str = "Unable to send a nlmsg";
+ goto mapinfo_num_error;
+ }
+ pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num);
+ return 0;
+mapinfo_num_error:
+ pr_info("%s: %s\n", __func__, err_str);
+ if (skb)
+ dev_kfree_skb(skb);
+ return ret;
+}
+
+static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
+{
+ struct nlmsghdr *nlh = NULL;
+ int ret = 0;
+
+ if (!skb)
+ return ret;
+ if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+ RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+ pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+ dev_kfree_skb(skb);
+ return -ENOMEM;
+ }
+ nlh->nlmsg_type = NLMSG_DONE;
+ ret = rdma_nl_unicast(skb, iwpm_pid);
+ if (ret)
+ pr_warn("%s Unable to send a nlmsg\n", __func__);
+ return ret;
+}
+
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
+{
+ struct iwpm_mapping_info *map_info;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ int skb_num = 0, mapping_num = 0;
+ int i = 0, nlmsg_bytes = 0;
+ unsigned long flags;
+ const char *err_str = "";
+ int ret;
+
+ skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to allocate skb";
+ goto send_mapping_info_exit;
+ }
+ skb_num++;
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ ret = -EINVAL;
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry(map_info, &iwpm_hash_bucket[i],
+ hlist_node) {
+ if (map_info->nl_client != nl_client)
+ continue;
+ nlh = NULL;
+ if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+ RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+ ret = -ENOMEM;
+ err_str = "Unable to put the nlmsg header";
+ goto send_mapping_info_unlock;
+ }
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh,
+ sizeof(struct sockaddr_storage),
+ &map_info->local_sockaddr,
+ IWPM_NLA_MAPINFO_LOCAL_ADDR);
+ if (ret)
+ goto send_mapping_info_unlock;
+
+ ret = ibnl_put_attr(skb, nlh,
+ sizeof(struct sockaddr_storage),
+ &map_info->mapped_sockaddr,
+ IWPM_NLA_MAPINFO_MAPPED_ADDR);
+ if (ret)
+ goto send_mapping_info_unlock;
+
+ nlmsg_end(skb, nlh);
+
+ iwpm_print_sockaddr(&map_info->local_sockaddr,
+ "send_mapping_info: Local sockaddr:");
+ iwpm_print_sockaddr(&map_info->mapped_sockaddr,
+ "send_mapping_info: Mapped local sockaddr:");
+ mapping_num++;
+ nlmsg_bytes += nlh->nlmsg_len;
+
+ /* check if all mappings can fit in one skb */
+ if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) {
+ /* and leave room for NLMSG_DONE */
+ nlmsg_bytes = 0;
+ skb_num++;
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock,
+ flags);
+ /* send the skb */
+ ret = send_nlmsg_done(skb, nl_client, iwpm_pid);
+ skb = NULL;
+ if (ret) {
+ err_str = "Unable to send map info";
+ goto send_mapping_info_exit;
+ }
+ if (skb_num == IWPM_MAPINFO_SKB_COUNT) {
+ ret = -ENOMEM;
+ err_str = "Insufficient skbs for map info";
+ goto send_mapping_info_exit;
+ }
+ skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to allocate skb";
+ goto send_mapping_info_exit;
+ }
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ }
+ }
+ }
+send_mapping_info_unlock:
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+send_mapping_info_exit:
+ if (ret) {
+ pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret);
+ if (skb)
+ dev_kfree_skb(skb);
+ return ret;
+ }
+ send_nlmsg_done(skb, nl_client, iwpm_pid);
+ return send_mapinfo_num(mapping_num, nl_client, iwpm_pid);
+}
+
+int iwpm_mapinfo_available(void)
+{
+ unsigned long flags;
+ int full_bucket = 0, i = 0;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ if (!hlist_empty(&iwpm_hash_bucket[i])) {
+ full_bucket = 1;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+ return full_bucket;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
new file mode 100644
index 000000000..af1fc14a0
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWPM_UTIL_H
+#define _IWPM_UTIL_H
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <net/netlink.h>
+#include <linux/errno.h>
+#include <rdma/iw_portmap.h>
+#include <rdma/rdma_netlink.h>
+
+
+#define IWPM_NL_RETRANS 3
+#define IWPM_NL_TIMEOUT (10*HZ)
+#define IWPM_MAPINFO_SKB_COUNT 20
+
+#define IWPM_PID_UNDEFINED -1
+#define IWPM_PID_UNAVAILABLE -2
+
+#define IWPM_REG_UNDEF 0x01
+#define IWPM_REG_VALID 0x02
+#define IWPM_REG_INCOMPL 0x04
+
+struct iwpm_nlmsg_request {
+ struct list_head inprocess_list;
+ __u32 nlmsg_seq;
+ void *req_buffer;
+ u8 nl_client;
+ u8 request_done;
+ u16 err_code;
+ struct semaphore sem;
+ struct kref kref;
+};
+
+struct iwpm_mapping_info {
+ struct hlist_node hlist_node;
+ struct sockaddr_storage local_sockaddr;
+ struct sockaddr_storage mapped_sockaddr;
+ u8 nl_client;
+};
+
+struct iwpm_remote_info {
+ struct hlist_node hlist_node;
+ struct sockaddr_storage remote_sockaddr;
+ struct sockaddr_storage mapped_loc_sockaddr;
+ struct sockaddr_storage mapped_rem_sockaddr;
+ u8 nl_client;
+};
+
+struct iwpm_admin_data {
+ atomic_t refcount;
+ atomic_t nlmsg_seq;
+ int client_list[RDMA_NL_NUM_CLIENTS];
+ u32 reg_list[RDMA_NL_NUM_CLIENTS];
+};
+
+/**
+ * iwpm_get_nlmsg_request - Allocate and initialize netlink message request
+ * @nlmsg_seq: Sequence number of the netlink message
+ * @nl_client: The index of the netlink client
+ * @gfp: Indicates how the memory for the request should be allocated
+ *
+ * Returns the newly allocated netlink request object if successful,
+ * otherwise returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+ u8 nl_client, gfp_t gfp);
+
+/**
+ * iwpm_free_nlmsg_request - Deallocate netlink message request
+ * @kref: Holds reference of netlink message request
+ */
+void iwpm_free_nlmsg_request(struct kref *kref);
+
+/**
+ * iwpm_find_nlmsg_request - Find netlink message request in the request list
+ * @echo_seq: Sequence number of the netlink request to find
+ *
+ * Returns the found netlink message request,
+ * if not found, returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq);
+
+/**
+ * iwpm_wait_complete_req - Block while servicing the netlink request
+ * @nlmsg_request: Netlink message request to service
+ *
+ * Wakes up, after the request is completed or expired
+ * Returns 0 if the request is complete without error
+ */
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request);
+
+/**
+ * iwpm_get_nlmsg_seq - Get the sequence number for a netlink
+ * message to send to the port mapper
+ *
+ * Returns the sequence number for the netlink message.
+ */
+int iwpm_get_nlmsg_seq(void);
+
+/**
+ * iwpm_add_reminfo - Add remote address info of the connecting peer
+ * to the remote info hash table
+ * @reminfo: The remote info to be added
+ */
+void iwpm_add_remote_info(struct iwpm_remote_info *reminfo);
+
+/**
+ * iwpm_valid_client - Check if the port mapper client is valid
+ * @nl_client: The index of the netlink client
+ *
+ * Valid clients need to call iwpm_init() before using
+ * the port mapper
+ */
+int iwpm_valid_client(u8 nl_client);
+
+/**
+ * iwpm_set_valid - Set the port mapper client to valid or not
+ * @nl_client: The index of the netlink client
+ * @valid: 1 if valid or 0 if invalid
+ */
+void iwpm_set_valid(u8 nl_client, int valid);
+
+/**
+ * iwpm_check_registration - Check if the client registration
+ * matches the given one
+ * @nl_client: The index of the netlink client
+ * @reg: The given registration type to compare with
+ *
+ * Call iwpm_register_pid() to register a client
+ * Returns true if the client registration matches reg,
+ * otherwise returns false
+ */
+u32 iwpm_check_registration(u8 nl_client, u32 reg);
+
+/**
+ * iwpm_set_registration - Set the client registration
+ * @nl_client: The index of the netlink client
+ * @reg: Registration type to set
+ */
+void iwpm_set_registration(u8 nl_client, u32 reg);
+
+/**
+ * iwpm_get_registration
+ * @nl_client: The index of the netlink client
+ *
+ * Returns the client registration type
+ */
+u32 iwpm_get_registration(u8 nl_client);
+
+/**
+ * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of
+ * a client to the user space port mapper
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the user space port mapper
+ *
+ * If successful, returns the number of sent mapping info records
+ */
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid);
+
+/**
+ * iwpm_mapinfo_available - Check if any mapping info records is available
+ * in the hash table
+ *
+ * Returns 1 if mapping information is available, otherwise returns 0
+ */
+int iwpm_mapinfo_available(void);
+
+/**
+ * iwpm_compare_sockaddr - Compare two sockaddr storage structs
+ *
+ * Returns 0 if they are holding the same ip/tcp address info,
+ * otherwise returns 1
+ */
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr);
+
+/**
+ * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes
+ * @nltb: Holds address of each netlink message attributes
+ * @nla_count: Number of netlink message attributes
+ *
+ * Returns error if any of the nla_count attributes is NULL
+ */
+static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[],
+ int nla_count)
+{
+ int i;
+ for (i = 1; i < nla_count; i++) {
+ if (!nltb[i])
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * iwpm_create_nlmsg - Allocate skb and form a netlink message
+ * @nl_op: Netlink message opcode
+ * @nlh: Holds address of the netlink message header in skb
+ * @nl_client: The index of the netlink client
+ *
+ * Returns the newly allcated skb, or NULL if the tailroom of the skb
+ * is insufficient to store the message header and payload
+ */
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+ int nl_client);
+
+/**
+ * iwpm_parse_nlmsg - Validate and parse the received netlink message
+ * @cb: Netlink callback structure
+ * @policy_max: Maximum attribute type to be expected
+ * @nlmsg_policy: Validation policy
+ * @nltb: Array to store policy_max parsed elements
+ * @msg_type: Type of netlink message
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+ const struct nla_policy *nlmsg_policy,
+ struct nlattr *nltb[], const char *msg_type);
+
+/**
+ * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port
+ * @sockaddr: Socket address to print
+ * @msg: Message to print
+ */
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+#endif
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
new file mode 100644
index 000000000..a36b3b4f5
--- /dev/null
+++ b/drivers/infiniband/core/mad.c
@@ -0,0 +1,3372 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/security.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "core_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "opa_smi.h"
+#include "agent.h"
+
+static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+/*
+ * The mlx4 driver uses the top byte to distinguish which virtual function
+ * generated the MAD, so we must avoid using it.
+ */
+#define AGENT_ID_LIMIT (1 << 24)
+static DEFINE_IDR(ib_mad_clients);
+static struct list_head ib_mad_port_list;
+
+/* Port list lock */
+static DEFINE_SPINLOCK(ib_mad_port_list_lock);
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+ struct ib_mad_port_private *port_priv,
+ const struct ib_mad_hdr *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv);
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc);
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc);
+
+/*
+ * Returns a ib_mad_port_private structure or NULL for a device/port
+ * Assumes ib_mad_port_list_lock is being held
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+
+ list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+ if (entry->device == device && entry->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Wrapper function to return a ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ entry = __ib_get_mad_port(device, port_num);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+ /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+ 0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+ switch (qp_type)
+ {
+ case IB_QPT_SMI:
+ return 0;
+ case IB_QPT_GSI:
+ return 1;
+ default:
+ return -1;
+ }
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+ return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+ if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+ (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return 0;
+ return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+ if (oui[0] || oui[1] || oui[2])
+ return 1;
+ return 0;
+}
+
+static int is_vendor_method_in_use(
+ struct ib_mad_mgmt_vendor_class *vendor_class,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ struct ib_mad_mgmt_method_table *method;
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+ method = vendor_class->method_table[i];
+ if (method) {
+ if (method_in_use(&method, mad_reg_req))
+ return 1;
+ else
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+int ib_response_mad(const struct ib_mad_hdr *hdr)
+{
+ return ((hdr->method & IB_MGMT_METHOD_RESP) ||
+ (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+ ((hdr->mgmt_class == IB_MGMT_CLASS_BM) &&
+ (hdr->attr_mod & IB_BM_ATTR_MOD_RESP)));
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ *
+ * Context: Process context.
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context,
+ u32 registration_flags)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_reg_req *reg_req = NULL;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_mad_mgmt_method_table *method;
+ int ret2, qpn;
+ u8 mgmt_class, vclass;
+
+ /* Validate parameters */
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n",
+ __func__, qp_type);
+ goto error1;
+ }
+
+ if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: invalid RMPP Version %u\n",
+ __func__, rmpp_version);
+ goto error1;
+ }
+
+ /* Validate MAD registration request if supplied */
+ if (mad_reg_req) {
+ if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: invalid Class Version %u\n",
+ __func__,
+ mad_reg_req->mgmt_class_version);
+ goto error1;
+ }
+ if (!recv_handler) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: no recv_handler\n", __func__);
+ goto error1;
+ }
+ if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+ /*
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+ * one in this range currently allowed
+ */
+ if (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid Mgmt Class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else if (mad_reg_req->mgmt_class == 0) {
+ /*
+ * Class 0 is reserved in IBA and is used for
+ * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ */
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid Mgmt Class 0\n",
+ __func__);
+ goto error1;
+ } else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+ /*
+ * If class is in "new" vendor range,
+ * ensure supplied OUI is not zero
+ */
+ if (!is_vendor_oui(mad_reg_req->oui)) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: No OUI specified for class 0x%x\n",
+ __func__,
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ /* Make sure class supplied is consistent with RMPP */
+ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+ if (rmpp_version) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: RMPP version for non-RMPP class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+
+ /* Make sure class supplied is consistent with QP type */
+ if (qp_type == IB_QPT_SMI) {
+ if ((mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+ (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid SM QP type: class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else {
+ if ((mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid GS QP type: class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ } else {
+ /* No registration request supplied */
+ if (!send_handler)
+ goto error1;
+ if (registration_flags & IB_MAD_USER_RMPP)
+ goto error1;
+ }
+
+ /* Validate device and port */
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n",
+ __func__, port_num);
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+
+ /* Verify the QP requested is supported. For example, Ethernet devices
+ * will not have QP0.
+ */
+ if (!port_priv->qp_info[qpn].qp) {
+ dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n",
+ __func__, qpn);
+ ret = ERR_PTR(-EPROTONOSUPPORT);
+ goto error1;
+ }
+
+ /* Allocate structures */
+ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+ if (!mad_agent_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ if (mad_reg_req) {
+ reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
+ if (!reg_req) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error3;
+ }
+ }
+
+ /* Now, fill in the various structures */
+ mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_agent_priv->reg_req = reg_req;
+ mad_agent_priv->agent.rmpp_version = rmpp_version;
+ mad_agent_priv->agent.device = device;
+ mad_agent_priv->agent.recv_handler = recv_handler;
+ mad_agent_priv->agent.send_handler = send_handler;
+ mad_agent_priv->agent.context = context;
+ mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_agent_priv->agent.port_num = port_num;
+ mad_agent_priv->agent.flags = registration_flags;
+ spin_lock_init(&mad_agent_priv->lock);
+ INIT_LIST_HEAD(&mad_agent_priv->send_list);
+ INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+ INIT_LIST_HEAD(&mad_agent_priv->done_list);
+ INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
+ INIT_LIST_HEAD(&mad_agent_priv->local_list);
+ INIT_WORK(&mad_agent_priv->local_work, local_completions);
+ atomic_set(&mad_agent_priv->refcount, 1);
+ init_completion(&mad_agent_priv->comp);
+
+ ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type);
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error4;
+ }
+
+ idr_preload(GFP_KERNEL);
+ idr_lock(&ib_mad_clients);
+ ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0,
+ AGENT_ID_LIMIT, GFP_ATOMIC);
+ idr_unlock(&ib_mad_clients);
+ idr_preload_end();
+
+ if (ret2 < 0) {
+ ret = ERR_PTR(ret2);
+ goto error5;
+ }
+ mad_agent_priv->agent.hi_tid = ret2;
+
+ /*
+ * Make sure MAD registration (if supplied)
+ * is non overlapping with any existing ones
+ */
+ spin_lock_irq(&port_priv->reg_lock);
+ if (mad_reg_req) {
+ mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+ if (!is_vendor_class(mgmt_class)) {
+ class = port_priv->version[mad_reg_req->
+ mgmt_class_version].class;
+ if (class) {
+ method = class->method_table[mgmt_class];
+ if (method) {
+ if (method_in_use(&method,
+ mad_reg_req))
+ goto error6;
+ }
+ }
+ ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+ mgmt_class);
+ } else {
+ /* "New" vendor class range */
+ vendor = port_priv->version[mad_reg_req->
+ mgmt_class_version].vendor;
+ if (vendor) {
+ vclass = vendor_class_index(mgmt_class);
+ vendor_class = vendor->vendor_class[vclass];
+ if (vendor_class) {
+ if (is_vendor_method_in_use(
+ vendor_class,
+ mad_reg_req))
+ goto error6;
+ }
+ }
+ ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+ }
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error6;
+ }
+ }
+ spin_unlock_irq(&port_priv->reg_lock);
+
+ return &mad_agent_priv->agent;
+error6:
+ spin_unlock_irq(&port_priv->reg_lock);
+ idr_lock(&ib_mad_clients);
+ idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
+ idr_unlock(&ib_mad_clients);
+error5:
+ ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
+error4:
+ kfree(reg_req);
+error3:
+ kfree(mad_agent_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (/*IB_MAD_SNOOP_POSTED_SENDS |
+ IB_MAD_SNOOP_RMPP_SENDS |*/
+ IB_MAD_SNOOP_SEND_COMPLETIONS /*|
+ IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
+}
+
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (IB_MAD_SNOOP_RECVS /*|
+ IB_MAD_SNOOP_RMPP_RECVS*/));
+}
+
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_snoop_private **new_snoop_table;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ /* Check for empty slot in array. */
+ for (i = 0; i < qp_info->snoop_table_size; i++)
+ if (!qp_info->snoop_table[i])
+ break;
+
+ if (i == qp_info->snoop_table_size) {
+ /* Grow table. */
+ new_snoop_table = krealloc(qp_info->snoop_table,
+ sizeof mad_snoop_priv *
+ (qp_info->snoop_table_size + 1),
+ GFP_ATOMIC);
+ if (!new_snoop_table) {
+ i = -ENOMEM;
+ goto out;
+ }
+
+ qp_info->snoop_table = new_snoop_table;
+ qp_info->snoop_table_size++;
+ }
+ qp_info->snoop_table[i] = mad_snoop_priv;
+ atomic_inc(&qp_info->snoop_count);
+out:
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ return i;
+}
+
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ int mad_snoop_flags,
+ ib_mad_snoop_handler snoop_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ int qpn;
+ int err;
+
+ /* Validate parameters */
+ if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+ (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+ /* Allocate structures */
+ mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+ if (!mad_snoop_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ /* Now, fill in the various structures */
+ mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_snoop_priv->agent.device = device;
+ mad_snoop_priv->agent.recv_handler = recv_handler;
+ mad_snoop_priv->agent.snoop_handler = snoop_handler;
+ mad_snoop_priv->agent.context = context;
+ mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_snoop_priv->agent.port_num = port_num;
+ mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+ init_completion(&mad_snoop_priv->comp);
+
+ err = ib_mad_agent_security_setup(&mad_snoop_priv->agent, qp_type);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto error2;
+ }
+
+ mad_snoop_priv->snoop_index = register_snoop_agent(
+ &port_priv->qp_info[qpn],
+ mad_snoop_priv);
+ if (mad_snoop_priv->snoop_index < 0) {
+ ret = ERR_PTR(mad_snoop_priv->snoop_index);
+ goto error3;
+ }
+
+ atomic_set(&mad_snoop_priv->refcount, 1);
+ return &mad_snoop_priv->agent;
+error3:
+ ib_mad_agent_security_cleanup(&mad_snoop_priv->agent);
+error2:
+ kfree(mad_snoop_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ complete(&mad_agent_priv->comp);
+}
+
+static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+ complete(&mad_snoop_priv->comp);
+}
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+
+ /* Note that we could still be handling received MADs */
+
+ /*
+ * Canceling all sends results in dropping received response
+ * MADs, preventing us from queuing additional work
+ */
+ cancel_mads(mad_agent_priv);
+ port_priv = mad_agent_priv->qp_info->port_priv;
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+
+ spin_lock_irq(&port_priv->reg_lock);
+ remove_mad_reg_req(mad_agent_priv);
+ spin_unlock_irq(&port_priv->reg_lock);
+ idr_lock(&ib_mad_clients);
+ idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
+ idr_unlock(&ib_mad_clients);
+
+ flush_workqueue(port_priv->wq);
+
+ deref_mad_agent(mad_agent_priv);
+ wait_for_completion(&mad_agent_priv->comp);
+ ib_cancel_rmpp_recvs(mad_agent_priv);
+
+ ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
+
+ kfree(mad_agent_priv->reg_req);
+ kfree_rcu(mad_agent_priv, rcu);
+}
+
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_qp_info *qp_info;
+ unsigned long flags;
+
+ qp_info = mad_snoop_priv->qp_info;
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+ atomic_dec(&qp_info->snoop_count);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+ deref_snoop_agent(mad_snoop_priv);
+ wait_for_completion(&mad_snoop_priv->comp);
+
+ ib_mad_agent_security_cleanup(&mad_snoop_priv->agent);
+
+ kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ *
+ * Context: Process context.
+ */
+void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+
+ /* If the TID is zero, the agent can only snoop. */
+ if (mad_agent->hi_tid) {
+ mad_agent_priv = container_of(mad_agent,
+ struct ib_mad_agent_private,
+ agent);
+ unregister_mad_agent(mad_agent_priv);
+ } else {
+ mad_snoop_priv = container_of(mad_agent,
+ struct ib_mad_snoop_private,
+ agent);
+ unregister_mad_snoop(mad_snoop_priv);
+ }
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+ struct ib_mad_queue *mad_queue;
+ unsigned long flags;
+
+ mad_queue = mad_list->mad_queue;
+ spin_lock_irqsave(&mad_queue->lock, flags);
+ list_del(&mad_list->list);
+ mad_queue->count--;
+ spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+ send_buf, mad_send_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL,
+ mad_recv_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid,
+ u16 pkey_index, u8 port_num, struct ib_wc *wc)
+{
+ memset(wc, 0, sizeof *wc);
+ wc->wr_cqe = cqe;
+ wc->status = IB_WC_SUCCESS;
+ wc->opcode = IB_WC_RECV;
+ wc->pkey_index = pkey_index;
+ wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+ wc->src_qp = IB_QP0;
+ wc->qp = qp;
+ wc->slid = slid;
+ wc->sl = 0;
+ wc->dlid_path_bits = 0;
+ wc->port_num = port_num;
+}
+
+static size_t mad_priv_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_mad_private) + mp->mad_size;
+}
+
+static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags)
+{
+ size_t size = sizeof(struct ib_mad_private) + mad_size;
+ struct ib_mad_private *ret = kzalloc(size, flags);
+
+ if (ret)
+ ret->mad_size = mad_size;
+
+ return ret;
+}
+
+static size_t port_mad_size(const struct ib_mad_port_private *port_priv)
+{
+ return rdma_max_mad_size(port_priv->device, port_priv->port_num);
+}
+
+static size_t mad_priv_dma_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_grh) + mp->mad_size;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret = 0;
+ struct ib_smp *smp = mad_send_wr->send_buf.mad;
+ struct opa_smp *opa_smp = (struct opa_smp *)smp;
+ unsigned long flags;
+ struct ib_mad_local_private *local;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent_private *recv_mad_agent = NULL;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num;
+ struct ib_wc mad_wc;
+ struct ib_ud_wr *send_wr = &mad_send_wr->send_wr;
+ size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
+ u16 out_mad_pkey_index = 0;
+ u16 drslid;
+ bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
+ if (rdma_cap_ib_switch(device) &&
+ smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ port_num = send_wr->port_num;
+ else
+ port_num = mad_agent_priv->agent.port_num;
+
+ /*
+ * Directed route handling starts if the initial LID routed part of
+ * a request or the ending LID routed part of a response is empty.
+ * If we are at the start of the LID routed part, don't update the
+ * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
+ */
+ if (opa && smp->class_version == OPA_SM_CLASS_VERSION) {
+ u32 opa_drslid;
+
+ if ((opa_get_smp_direction(opa_smp)
+ ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
+ OPA_LID_PERMISSIVE &&
+ opa_smi_handle_dr_smp_send(opa_smp,
+ rdma_cap_ib_switch(device),
+ port_num) == IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid directed route\n");
+ goto out;
+ }
+ opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
+ if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) &&
+ opa_drslid & 0xffff0000) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n",
+ opa_drslid);
+ goto out;
+ }
+ drslid = (u16)(opa_drslid & 0x0000ffff);
+
+ /* Check to post send on QP or process locally */
+ if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD &&
+ opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
+ goto out;
+ } else {
+ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+ IB_LID_PERMISSIVE &&
+ smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
+ IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "Invalid directed route\n");
+ goto out;
+ }
+ drslid = be16_to_cpu(smp->dr_slid);
+
+ /* Check to post send on QP or process locally */
+ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+ smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+ goto out;
+ }
+
+ local = kmalloc(sizeof *local, GFP_ATOMIC);
+ if (!local) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ local->mad_priv = NULL;
+ local->recv_mad_agent = NULL;
+ mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ kfree(local);
+ goto out;
+ }
+
+ build_smp_wc(mad_agent_priv->agent.qp,
+ send_wr->wr.wr_cqe, drslid,
+ send_wr->pkey_index,
+ send_wr->port_num, &mad_wc);
+
+ if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) {
+ mad_wc.byte_len = mad_send_wr->send_buf.hdr_len
+ + mad_send_wr->send_buf.data_len
+ + sizeof(struct ib_grh);
+ }
+
+ /* No GRH for DR SMP */
+ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+ (const struct ib_mad_hdr *)smp, mad_size,
+ (struct ib_mad_hdr *)mad_priv->mad,
+ &mad_size, &out_mad_pkey_index);
+ switch (ret)
+ {
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+ if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) &&
+ mad_agent_priv->agent.recv_handler) {
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = mad_agent_priv;
+ /*
+ * Reference MAD agent until receive
+ * side of local completion handled
+ */
+ atomic_inc(&mad_agent_priv->refcount);
+ } else
+ kfree(mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+ kfree(mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS:
+ /* Treat like an incoming receive MAD */
+ port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+ mad_agent_priv->agent.port_num);
+ if (port_priv) {
+ memcpy(mad_priv->mad, smp, mad_priv->mad_size);
+ recv_mad_agent = find_mad_agent(port_priv,
+ (const struct ib_mad_hdr *)mad_priv->mad);
+ }
+ if (!port_priv || !recv_mad_agent) {
+ /*
+ * No receiving agent so drop packet and
+ * generate send completion.
+ */
+ kfree(mad_priv);
+ break;
+ }
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = recv_mad_agent;
+ break;
+ default:
+ kfree(mad_priv);
+ kfree(local);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ local->mad_send_wr = mad_send_wr;
+ if (opa) {
+ local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index;
+ local->return_wc_byte_len = mad_size;
+ }
+ /* Reference MAD agent until send side of local completion handled */
+ atomic_inc(&mad_agent_priv->refcount);
+ /* Queue local completion to local list */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->local_work);
+ ret = 1;
+out:
+ return ret;
+}
+
+static int get_pad_size(int hdr_len, int data_len, size_t mad_size)
+{
+ int seg_size, pad;
+
+ seg_size = mad_size - hdr_len;
+ if (data_len && seg_size) {
+ pad = seg_size - data_len % seg_size;
+ return pad == seg_size ? 0 : pad;
+ } else
+ return seg_size;
+}
+
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_segment *s, *t;
+
+ list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) {
+ list_del(&s->list);
+ kfree(s);
+ }
+}
+
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+ size_t mad_size, gfp_t gfp_mask)
+{
+ struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+ struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+ struct ib_rmpp_segment *seg = NULL;
+ int left, seg_size, pad;
+
+ send_buf->seg_size = mad_size - send_buf->hdr_len;
+ send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR;
+ seg_size = send_buf->seg_size;
+ pad = send_wr->pad;
+
+ /* Allocate data segments. */
+ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+ seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+ if (!seg) {
+ free_send_rmpp_list(send_wr);
+ return -ENOMEM;
+ }
+ seg->num = ++send_buf->seg_count;
+ list_add_tail(&seg->list, &send_wr->rmpp_list);
+ }
+
+ /* Zero any padding */
+ if (pad)
+ memset(seg->data + seg_size - pad, 0, pad);
+
+ rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+ agent.rmpp_version;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+ struct ib_rmpp_segment, list);
+ send_wr->last_ack_seg = send_wr->cur_seg;
+ return 0;
+}
+
+int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent)
+{
+ return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
+}
+EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
+
+struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ gfp_t gfp_mask,
+ u8 base_version)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int pad, message_size, ret, size;
+ void *buf;
+ size_t mad_size;
+ bool opa;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+
+ opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num);
+
+ if (opa && base_version == OPA_MGMT_BASE_VERSION)
+ mad_size = sizeof(struct opa_mad);
+ else
+ mad_size = sizeof(struct ib_mad);
+
+ pad = get_pad_size(hdr_len, data_len, mad_size);
+ message_size = hdr_len + data_len + pad;
+
+ if (ib_mad_kernel_rmpp_agent(mad_agent)) {
+ if (!rmpp_active && message_size > mad_size)
+ return ERR_PTR(-EINVAL);
+ } else
+ if (rmpp_active || message_size > mad_size)
+ return ERR_PTR(-EINVAL);
+
+ size = rmpp_active ? hdr_len : mad_size;
+ buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ mad_send_wr = buf + size;
+ INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+ mad_send_wr->send_buf.mad = buf;
+ mad_send_wr->send_buf.hdr_len = hdr_len;
+ mad_send_wr->send_buf.data_len = data_len;
+ mad_send_wr->pad = pad;
+
+ mad_send_wr->mad_agent_priv = mad_agent_priv;
+ mad_send_wr->sg_list[0].length = hdr_len;
+ mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ /* OPA MADs don't have to be the full 2048 bytes */
+ if (opa && base_version == OPA_MGMT_BASE_VERSION &&
+ data_len < mad_size - hdr_len)
+ mad_send_wr->sg_list[1].length = data_len;
+ else
+ mad_send_wr->sg_list[1].length = mad_size - hdr_len;
+
+ mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
+ mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list;
+ mad_send_wr->send_wr.wr.num_sge = 2;
+ mad_send_wr->send_wr.wr.opcode = IB_WR_SEND;
+ mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED;
+ mad_send_wr->send_wr.remote_qpn = remote_qpn;
+ mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY;
+ mad_send_wr->send_wr.pkey_index = pkey_index;
+
+ if (rmpp_active) {
+ ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask);
+ if (ret) {
+ kfree(buf);
+ return ERR_PTR(ret);
+ }
+ }
+
+ mad_send_wr->send_buf.mad_agent = mad_agent;
+ atomic_inc(&mad_agent_priv->refcount);
+ return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+ if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+ return IB_MGMT_SA_HDR;
+ else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS))
+ return IB_MGMT_DEVICE_HDR;
+ else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return IB_MGMT_VENDOR_HDR;
+ else
+ return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+ if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS) ||
+ ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct list_head *list;
+
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+ list = &mad_send_wr->cur_seg->list;
+
+ if (mad_send_wr->cur_seg->num < seg_num) {
+ list_for_each_entry(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ } else if (mad_send_wr->cur_seg->num > seg_num) {
+ list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ }
+ return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ if (mad_send_wr->send_buf.seg_count)
+ return ib_get_rmpp_segment(&mad_send_wr->send_buf,
+ mad_send_wr->seg_num);
+ else
+ return mad_send_wr->send_buf.mad +
+ mad_send_wr->send_buf.hdr_len;
+}
+
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ mad_agent_priv = container_of(send_buf->mad_agent,
+ struct ib_mad_agent_private, agent);
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+
+ free_send_rmpp_list(mad_send_wr);
+ kfree(send_buf->mad);
+ deref_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct list_head *list;
+ struct ib_mad_agent *mad_agent;
+ struct ib_sge *sge;
+ unsigned long flags;
+ int ret;
+
+ /* Set WR ID to find mad_send_wr upon completion */
+ qp_info = mad_send_wr->mad_agent_priv->qp_info;
+ mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
+
+ mad_agent = mad_send_wr->send_buf.mad_agent;
+ sge = mad_send_wr->sg_list;
+ sge[0].addr = ib_dma_map_single(mad_agent->device,
+ mad_send_wr->send_buf.mad,
+ sge[0].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr)))
+ return -ENOMEM;
+
+ mad_send_wr->header_mapping = sge[0].addr;
+
+ sge[1].addr = ib_dma_map_single(mad_agent->device,
+ ib_get_payload(mad_send_wr),
+ sge[1].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ return -ENOMEM;
+ }
+ mad_send_wr->payload_mapping = sge[1].addr;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+ ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,
+ NULL);
+ list = &qp_info->send_queue.list;
+ } else {
+ ret = 0;
+ list = &qp_info->overflow_list;
+ }
+
+ if (!ret) {
+ qp_info->send_queue.count++;
+ list_add_tail(&mad_send_wr->mad_list.list, list);
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+ if (ret) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->payload_mapping,
+ sge[1].length, DMA_TO_DEVICE);
+ }
+ return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf *next_send_buf;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ /* Walk list of send WRs and post each on send list */
+ for (; send_buf; send_buf = next_send_buf) {
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+ ret = ib_mad_enforce_security(mad_agent_priv,
+ mad_send_wr->send_wr.pkey_index);
+ if (ret)
+ goto error;
+
+ if (!send_buf->mad_agent->send_handler ||
+ (send_buf->timeout_ms &&
+ !send_buf->mad_agent->recv_handler)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+ if (mad_agent_priv->agent.rmpp_version) {
+ ret = -EINVAL;
+ goto error;
+ }
+ }
+
+ /*
+ * Save pointer to next work request to post in case the
+ * current one completes, and the user modifies the work
+ * request associated with the completion
+ */
+ next_send_buf = send_buf->next;
+ mad_send_wr->send_wr.ah = send_buf->ah;
+
+ if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ ret = handle_outgoing_dr_smp(mad_agent_priv,
+ mad_send_wr);
+ if (ret < 0) /* error */
+ goto error;
+ else if (ret == 1) /* locally consumed */
+ continue;
+ }
+
+ mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+ /* Timeout will be updated after send completes */
+ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+ mad_send_wr->max_retries = send_buf->retries;
+ mad_send_wr->retries_left = send_buf->retries;
+ send_buf->retries = 0;
+ /* Reference for work request to QP + response */
+ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+ mad_send_wr->status = IB_WC_SUCCESS;
+
+ /* Reference MAD agent until send completes */
+ atomic_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_send_rmpp_mad(mad_send_wr);
+ if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+ ret = ib_send_mad(mad_send_wr);
+ } else
+ ret = ib_send_mad(mad_send_wr);
+ if (ret < 0) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ goto error;
+ }
+ }
+ return 0;
+error:
+ if (bad_send_buf)
+ *bad_send_buf = send_buf;
+ return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ * a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *priv;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+ list_splice_init(&mad_recv_wc->rmpp_list, &free_list);
+
+ list_for_each_entry_safe(mad_recv_buf, temp_recv_buf,
+ &free_list, list) {
+ mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc,
+ recv_buf);
+ mad_priv_hdr = container_of(mad_recv_wc,
+ struct ib_mad_private_header,
+ recv_wc);
+ priv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+ kfree(priv);
+ }
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ return ERR_PTR(-EINVAL); /* XXX: for now */
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ struct ib_wc *wc)
+{
+ dev_err(&mad_agent->device->dev,
+ "ib_process_mad_wc() not implemented yet\n");
+ return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ int i;
+
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
+ if ((*method)->agent[i]) {
+ pr_err("Method %d already in use\n", i);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+ /* Allocate management method table */
+ *method = kzalloc(sizeof **method, GFP_ATOMIC);
+ return (*method) ? 0 : (-ENOMEM);
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+ int i;
+
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+ if (method->agent[i])
+ return 1;
+ return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_CLASS; i++)
+ if (class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ if (vendor_class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+ const char *oui)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp(vendor_class->oui[i], oui, 3))
+ return i;
+
+ return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+ if (vendor->vendor_class[i])
+ return 1;
+
+ return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+ struct ib_mad_agent_private *agent)
+{
+ int i;
+
+ /* Remove any methods for this mad agent */
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
+ if (method->agent[i] == agent) {
+ method->agent[i] = NULL;
+ }
+ }
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table **class;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret;
+
+ port_priv = agent_priv->qp_info->port_priv;
+ class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+ if (!*class) {
+ /* Allocate management class table for "new" class version */
+ *class = kzalloc(sizeof **class, GFP_ATOMIC);
+ if (!*class) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ /* Allocate method table for this management class */
+ method = &(*class)->method_table[mgmt_class];
+ if ((ret = allocate_method_table(method)))
+ goto error2;
+ } else {
+ method = &(*class)->method_table[mgmt_class];
+ if (!*method) {
+ /* Allocate method table for this management class */
+ if ((ret = allocate_method_table(method)))
+ goto error1;
+ }
+ }
+
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error3;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+error3:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+ goto error1;
+error2:
+ kfree(*class);
+ *class = NULL;
+error1:
+ return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_vendor_class_table **vendor_table;
+ struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+ struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret = -ENOMEM;
+ u8 vclass;
+
+ /* "New" vendor (with OUI) class */
+ vclass = vendor_class_index(mad_reg_req->mgmt_class);
+ port_priv = agent_priv->qp_info->port_priv;
+ vendor_table = &port_priv->version[
+ mad_reg_req->mgmt_class_version].vendor;
+ if (!*vendor_table) {
+ /* Allocate mgmt vendor class table for "new" class version */
+ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+ if (!vendor)
+ goto error1;
+
+ *vendor_table = vendor;
+ }
+ if (!(*vendor_table)->vendor_class[vclass]) {
+ /* Allocate table for this management vendor class */
+ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+ if (!vendor_class)
+ goto error2;
+
+ (*vendor_table)->vendor_class[vclass] = vendor_class;
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3)) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ if (!*method)
+ goto error3;
+ goto check_in_use;
+ }
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* OUI slot available ? */
+ if (!is_vendor_oui((*vendor_table)->vendor_class[
+ vclass]->oui[i])) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ /* Allocate method table for this OUI */
+ if (!*method) {
+ ret = allocate_method_table(method);
+ if (ret)
+ goto error3;
+ }
+ memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3);
+ goto check_in_use;
+ }
+ }
+ dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n");
+ goto error3;
+
+check_in_use:
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error4;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+error4:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+error3:
+ if (vendor_class) {
+ (*vendor_table)->vendor_class[vclass] = NULL;
+ kfree(vendor_class);
+ }
+error2:
+ if (vendor) {
+ *vendor_table = NULL;
+ kfree(vendor);
+ }
+error1:
+ return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ int index;
+ u8 mgmt_class;
+
+ /*
+ * Was MAD registration request supplied
+ * with original registration ?
+ */
+ if (!agent_priv->reg_req) {
+ goto out;
+ }
+
+ port_priv = agent_priv->qp_info->port_priv;
+ mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+ class = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].class;
+ if (!class)
+ goto vendor_check;
+
+ method = class->method_table[mgmt_class];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /* Now, check to see if there are any methods still in use */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left ? */
+ if (!check_class_table(class)) {
+ /* If not, release management class table */
+ kfree(class);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].class = NULL;
+ }
+ }
+ }
+
+vendor_check:
+ if (!is_vendor_class(mgmt_class))
+ goto out;
+
+ /* normalize mgmt_class to vendor range 2 */
+ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+ vendor = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].vendor;
+
+ if (!vendor)
+ goto out;
+
+ vendor_class = vendor->vendor_class[mgmt_class];
+ if (vendor_class) {
+ index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+ if (index < 0)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /*
+ * Now, check to see if there are
+ * any methods still in use
+ */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ vendor_class->method_table[index] = NULL;
+ memset(vendor_class->oui[index], 0, 3);
+ /* Any OUIs left ? */
+ if (!check_vendor_class(vendor_class)) {
+ /* If not, release vendor class table */
+ kfree(vendor_class);
+ vendor->vendor_class[mgmt_class] = NULL;
+ /* Any other vendor classes left ? */
+ if (!check_vendor_table(vendor)) {
+ kfree(vendor);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].
+ vendor = NULL;
+ }
+ }
+ }
+ }
+ }
+
+out:
+ return;
+}
+
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+ const struct ib_mad_hdr *mad_hdr)
+{
+ struct ib_mad_agent_private *mad_agent = NULL;
+ unsigned long flags;
+
+ if (ib_response_mad(mad_hdr)) {
+ u32 hi_tid;
+
+ /*
+ * Routing is based on high 32 bits of transaction ID
+ * of MAD.
+ */
+ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
+ rcu_read_lock();
+ mad_agent = idr_find(&ib_mad_clients, hi_tid);
+ if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount))
+ mad_agent = NULL;
+ rcu_read_unlock();
+ } else {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ const struct ib_vendor_mad *vendor_mad;
+ int index;
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ /*
+ * Routing is based on version, class, and method
+ * For "newer" vendor MADs, also based on OUI
+ */
+ if (mad_hdr->class_version >= MAX_MGMT_VERSION)
+ goto out;
+ if (!is_vendor_class(mad_hdr->mgmt_class)) {
+ class = port_priv->version[
+ mad_hdr->class_version].class;
+ if (!class)
+ goto out;
+ if (convert_mgmt_class(mad_hdr->mgmt_class) >=
+ ARRAY_SIZE(class->method_table))
+ goto out;
+ method = class->method_table[convert_mgmt_class(
+ mad_hdr->mgmt_class)];
+ if (method)
+ mad_agent = method->agent[mad_hdr->method &
+ ~IB_MGMT_METHOD_RESP];
+ } else {
+ vendor = port_priv->version[
+ mad_hdr->class_version].vendor;
+ if (!vendor)
+ goto out;
+ vendor_class = vendor->vendor_class[vendor_class_index(
+ mad_hdr->mgmt_class)];
+ if (!vendor_class)
+ goto out;
+ /* Find matching OUI */
+ vendor_mad = (const struct ib_vendor_mad *)mad_hdr;
+ index = find_vendor_oui(vendor_class, vendor_mad->oui);
+ if (index == -1)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ mad_agent = method->agent[mad_hdr->method &
+ ~IB_MGMT_METHOD_RESP];
+ }
+ }
+ if (mad_agent)
+ atomic_inc(&mad_agent->refcount);
+out:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+ }
+
+ if (mad_agent && !mad_agent->agent.recv_handler) {
+ dev_notice(&port_priv->device->dev,
+ "No receive handler for client %p on port %d\n",
+ &mad_agent->agent, port_priv->port_num);
+ deref_mad_agent(mad_agent);
+ mad_agent = NULL;
+ }
+
+ return mad_agent;
+}
+
+static int validate_mad(const struct ib_mad_hdr *mad_hdr,
+ const struct ib_mad_qp_info *qp_info,
+ bool opa)
+{
+ int valid = 0;
+ u32 qp_num = qp_info->qp->qp_num;
+
+ /* Make sure MAD base version is understood */
+ if (mad_hdr->base_version != IB_MGMT_BASE_VERSION &&
+ (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) {
+ pr_err("MAD received with unsupported base version %d %s\n",
+ mad_hdr->base_version, opa ? "(opa)" : "");
+ goto out;
+ }
+
+ /* Filter SMI packets sent to other than QP0 */
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ if (qp_num == 0)
+ valid = 1;
+ } else {
+ /* CM attributes other than ClassPortInfo only use Send method */
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) &&
+ (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) &&
+ (mad_hdr->method != IB_MGMT_METHOD_SEND))
+ goto out;
+ /* Filter GSI packets sent to QP0 */
+ if (qp_num != 0)
+ valid = 1;
+ }
+
+out:
+ return valid;
+}
+
+static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_hdr *mad_hdr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
+ return !mad_agent_priv->agent.rmpp_version ||
+ !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) ||
+ !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE) ||
+ (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
+}
+
+static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc)
+{
+ return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class ==
+ rwc->recv_buf.mad->mad_hdr.mgmt_class;
+}
+
+static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc )
+{
+ struct rdma_ah_attr attr;
+ u8 send_resp, rcv_resp;
+ union ib_gid sgid;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num = mad_agent_priv->agent.port_num;
+ u8 lmc;
+ bool has_grh;
+
+ send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad);
+ rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr);
+
+ if (send_resp == rcv_resp)
+ /* both requests, or both responses. GIDs different */
+ return 0;
+
+ if (rdma_query_ah(wr->send_buf.ah, &attr))
+ /* Assume not equal, to avoid false positives. */
+ return 0;
+
+ has_grh = !!(rdma_ah_get_ah_flags(&attr) & IB_AH_GRH);
+ if (has_grh != !!(rwc->wc->wc_flags & IB_WC_GRH))
+ /* one has GID, other does not. Assume different */
+ return 0;
+
+ if (!send_resp && rcv_resp) {
+ /* is request/response. */
+ if (!has_grh) {
+ if (ib_get_cached_lmc(device, port_num, &lmc))
+ return 0;
+ return (!lmc || !((rdma_ah_get_path_bits(&attr) ^
+ rwc->wc->dlid_path_bits) &
+ ((1 << lmc) - 1)));
+ } else {
+ const struct ib_global_route *grh =
+ rdma_ah_read_grh(&attr);
+
+ if (rdma_query_gid(device, port_num,
+ grh->sgid_index, &sgid))
+ return 0;
+ return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+ 16);
+ }
+ }
+
+ if (!has_grh)
+ return rdma_ah_get_dlid(&attr) == rwc->wc->slid;
+ else
+ return !memcmp(rdma_ah_read_grh(&attr)->dgid.raw,
+ rwc->recv_buf.grh->sgid.raw,
+ 16);
+}
+
+static inline int is_direct(u8 class)
+{
+ return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+}
+
+struct ib_mad_send_wr_private*
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *wc)
+{
+ struct ib_mad_send_wr_private *wr;
+ const struct ib_mad_hdr *mad_hdr;
+
+ mad_hdr = &wc->recv_buf.mad->mad_hdr;
+
+ list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+ if ((wr->tid == mad_hdr->tid) &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(mad_hdr->mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+
+ /*
+ * It's possible to receive the response before we've
+ * been notified that the send has completed
+ */
+ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+ if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+ wr->tid == mad_hdr->tid &&
+ wr->timeout &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(mad_hdr->mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ /* Verify request has not been canceled */
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+ return NULL;
+}
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ mad_send_wr->timeout = 0;
+ if (mad_send_wr->refcount == 1)
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->done_list);
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+ int ret;
+
+ INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+ ret = ib_mad_enforce_security(mad_agent_priv,
+ mad_recv_wc->wc->pkey_index);
+ if (ret) {
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+
+ list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+ mad_recv_wc);
+ if (!mad_recv_wc) {
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ }
+
+ /* Complete corresponding request */
+ if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
+ && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
+ & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ /* user rmpp is in effect
+ * and this is an active RMPP MAD
+ */
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent, NULL,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ } else {
+ /* not user rmpp, revert to normal behavior and
+ * drop the mad */
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ } else {
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Defined behavior is to complete response before request */
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent,
+ &mad_send_wr->send_buf,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ }
+ } else {
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
+ mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+
+ return;
+}
+
+static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv,
+ const struct ib_mad_qp_info *qp_info,
+ const struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ enum smi_forward_action retsmi;
+ struct ib_smp *smp = (struct ib_smp *)recv->mad;
+
+ if (smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ response->mad_size,
+ false);
+
+ return IB_SMI_DISCARD;
+ }
+ return IB_SMI_HANDLE;
+}
+
+static bool generate_unmatched_resp(const struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ size_t *resp_len, bool opa)
+{
+ const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad;
+ struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad;
+
+ if (recv_hdr->method == IB_MGMT_METHOD_GET ||
+ recv_hdr->method == IB_MGMT_METHOD_SET) {
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+ resp_hdr->method = IB_MGMT_METHOD_GET_RESP;
+ resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+ if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ resp_hdr->status |= IB_SMP_DIRECTION;
+
+ if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) {
+ if (recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ *resp_len = opa_get_smp_header_size(
+ (struct opa_smp *)recv->mad);
+ else
+ *resp_len = sizeof(struct ib_mad_hdr);
+ }
+
+ return true;
+ } else {
+ return false;
+ }
+}
+
+static enum smi_action
+handle_opa_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ enum smi_forward_action retsmi;
+ struct opa_smp *smp = (struct opa_smp *)recv->mad;
+
+ if (opa_smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = opa_smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (opa_smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (opa_smi_check_local_smp(smp, port_priv->device) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.opa_mad =
+ (struct opa_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ opa_smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ recv->header.wc.byte_len,
+ true);
+
+ return IB_SMI_DISCARD;
+ }
+
+ return IB_SMI_HANDLE;
+}
+
+static enum smi_action
+handle_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ bool opa)
+{
+ struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad;
+
+ if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION &&
+ mad_hdr->class_version == OPA_SM_CLASS_VERSION)
+ return handle_opa_smi(port_priv, qp_info, wc, port_num, recv,
+ response);
+
+ return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response);
+}
+
+static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv, *response = NULL;
+ struct ib_mad_agent_private *mad_agent;
+ int port_num;
+ int ret = IB_MAD_RESULT_SUCCESS;
+ size_t mad_size;
+ u16 resp_mad_pkey_index = 0;
+ bool opa;
+
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ /*
+ * Receive errors indicate that the QP has entered the error
+ * state - error handling/shutdown code will cleanup
+ */
+ return;
+ }
+
+ qp_info = mad_list->mad_queue->qp_info;
+ dequeue_mad(mad_list);
+
+ opa = rdma_cap_opa_mad(qp_info->port_priv->device,
+ qp_info->port_priv->port_num);
+
+ mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+ ib_dma_unmap_single(port_priv->device,
+ recv->header.mapping,
+ mad_priv_dma_size(recv),
+ DMA_FROM_DEVICE);
+
+ /* Setup MAD receive work completion from "normal" work completion */
+ recv->header.wc = *wc;
+ recv->header.recv_wc.wc = &recv->header.wc;
+
+ if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) {
+ recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
+ recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad;
+ recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+ /* Validate MAD */
+ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
+ goto out;
+
+ mad_size = recv->mad_size;
+ response = alloc_mad_private(mad_size, GFP_KERNEL);
+ if (!response)
+ goto out;
+
+ if (rdma_cap_ib_switch(port_priv->device))
+ port_num = wc->port_num;
+ else
+ port_num = port_priv->port_num;
+
+ if (((struct ib_mad_hdr *)recv->mad)->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (handle_smi(port_priv, qp_info, wc, port_num, recv,
+ response, opa)
+ == IB_SMI_DISCARD)
+ goto out;
+ }
+
+ /* Give driver "right of first refusal" on incoming MAD */
+ if (port_priv->device->process_mad) {
+ ret = port_priv->device->process_mad(port_priv->device, 0,
+ port_priv->port_num,
+ wc, &recv->grh,
+ (const struct ib_mad_hdr *)recv->mad,
+ recv->mad_size,
+ (struct ib_mad_hdr *)response->mad,
+ &mad_size, &resp_mad_pkey_index);
+
+ if (opa)
+ wc->pkey_index = resp_mad_pkey_index;
+
+ if (ret & IB_MAD_RESULT_SUCCESS) {
+ if (ret & IB_MAD_RESULT_CONSUMED)
+ goto out;
+ if (ret & IB_MAD_RESULT_REPLY) {
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &recv->grh, wc,
+ port_priv->device,
+ port_num,
+ qp_info->qp->qp_num,
+ mad_size, opa);
+ goto out;
+ }
+ }
+ }
+
+ mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);
+ if (mad_agent) {
+ ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+ /*
+ * recv is freed up in error cases in ib_mad_complete_recv
+ * or via recv_handler in ib_mad_complete_recv()
+ */
+ recv = NULL;
+ } else if ((ret & IB_MAD_RESULT_SUCCESS) &&
+ generate_unmatched_resp(recv, response, &mad_size, opa)) {
+ agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc,
+ port_priv->device, port_num,
+ qp_info->qp->qp_num, mad_size, opa);
+ }
+
+out:
+ /* Post another receive request for this QP */
+ if (response) {
+ ib_mad_post_receive_mads(qp_info, response);
+ kfree(recv);
+ } else
+ ib_mad_post_receive_mads(qp_info, recv);
+}
+
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long delay;
+
+ if (list_empty(&mad_agent_priv->wait_list)) {
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+ } else {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_agent_priv->timeout,
+ mad_send_wr->timeout)) {
+ mad_agent_priv->timeout = mad_send_wr->timeout;
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ }
+ }
+}
+
+static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ list_del(&mad_send_wr->agent_list);
+
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ }
+ else
+ list_item = &mad_agent_priv->wait_list;
+ list_add(&mad_send_wr->agent_list, list_item);
+
+ /* Reschedule a work item if we have a shorter timeout */
+ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+}
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ int timeout_ms)
+{
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ wait_for_response(mad_send_wr);
+}
+
+/*
+ * Process a send work completion
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ unsigned long flags;
+ int ret;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+ if (ret == IB_RMPP_RESULT_CONSUMED)
+ goto done;
+ } else
+ ret = IB_RMPP_RESULT_UNHANDLED;
+
+ if (mad_send_wc->status != IB_WC_SUCCESS &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = mad_send_wc->status;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ if (--mad_send_wr->refcount > 0) {
+ if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ wait_for_response(mad_send_wr);
+ }
+ goto done;
+ }
+
+ /* Remove send from MAD agent and notify client of completion */
+ list_del(&mad_send_wr->agent_list);
+ adjust_timeout(mad_agent_priv);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status != IB_WC_SUCCESS )
+ mad_send_wc->status = mad_send_wr->status;
+ if (ret == IB_RMPP_RESULT_INTERNAL)
+ ib_rmpp_send_handler(mad_send_wc);
+ else
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ mad_send_wc);
+
+ /* Release reference on agent taken when sending */
+ deref_mad_agent(mad_agent_priv);
+ return;
+done:
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_queue *send_queue;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+ int ret;
+
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (!ib_mad_send_error(port_priv, wc))
+ return;
+ }
+
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ send_queue = mad_list->mad_queue;
+ qp_info = send_queue->qp_info;
+
+retry:
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->header_mapping,
+ mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->payload_mapping,
+ mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+ queued_send_wr = NULL;
+ spin_lock_irqsave(&send_queue->lock, flags);
+ list_del(&mad_list->list);
+
+ /* Move queued send to the send queue */
+ if (send_queue->count-- > send_queue->max_active) {
+ mad_list = container_of(qp_info->overflow_list.next,
+ struct ib_mad_list_head, list);
+ queued_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ list_move_tail(&mad_list->list, &send_queue->list);
+ }
+ spin_unlock_irqrestore(&send_queue->lock, flags);
+
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = wc->status;
+ mad_send_wc.vendor_err = wc->vendor_err;
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc,
+ IB_MAD_SNOOP_SEND_COMPLETIONS);
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+ if (queued_send_wr) {
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,
+ NULL);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "ib_post_send failed: %d\n", ret);
+ mad_send_wr = queued_send_wr;
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ goto retry;
+ }
+ }
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_list_head *mad_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+ mad_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ mad_send_wr->retry = 1;
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int ret;
+
+ /*
+ * Send errors will transition the QP to SQE - move
+ * QP to RTS and repost flushed work requests
+ */
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ if (wc->status == IB_WC_WR_FLUSH_ERR) {
+ if (mad_send_wr->retry) {
+ /* Repost send */
+ mad_send_wr->retry = 0;
+ ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
+ NULL);
+ if (!ret)
+ return false;
+ }
+ } else {
+ struct ib_qp_attr *attr;
+
+ /* Transition QP to RTS and fail offending send */
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (attr) {
+ attr->qp_state = IB_QPS_RTS;
+ attr->cur_qp_state = IB_QPS_SQE;
+ ret = ib_modify_qp(qp_info->qp, attr,
+ IB_QP_STATE | IB_QP_CUR_STATE);
+ kfree(attr);
+ if (ret)
+ dev_err(&port_priv->device->dev,
+ "%s - ib_modify_qp to RTS: %d\n",
+ __func__, ret);
+ else
+ mark_sends_for_retry(qp_info);
+ }
+ }
+
+ return true;
+}
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ unsigned long flags;
+ struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ struct list_head cancel_list;
+
+ INIT_LIST_HEAD(&cancel_list);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->send_list, agent_list) {
+ if (mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+ }
+
+ /* Empty wait list to prevent receives from finding a request */
+ list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Report all cancelled requests */
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &cancel_list, agent_list) {
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ list_del(&mad_send_wr->agent_list);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ }
+}
+
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+ agent_list) {
+ if (&mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+ agent_list) {
+ if (is_rmpp_data_mad(mad_agent_priv,
+ mad_send_wr->send_buf.mad) &&
+ &mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+ return NULL;
+}
+
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int active;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
+ if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return -EINVAL;
+ }
+
+ active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
+ if (!timeout_ms) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ mad_send_wr->send_buf.timeout_ms = timeout_ms;
+ if (active)
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ else
+ ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf)
+{
+ ib_modify_mad(mad_agent, send_buf, 0);
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+static void local_completions(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_local_private *local;
+ struct ib_mad_agent_private *recv_mad_agent;
+ unsigned long flags;
+ int free_mad;
+ struct ib_wc wc;
+ struct ib_mad_send_wc mad_send_wc;
+ bool opa;
+
+ mad_agent_priv =
+ container_of(work, struct ib_mad_agent_private, local_work);
+
+ opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->local_list)) {
+ local = list_entry(mad_agent_priv->local_list.next,
+ struct ib_mad_local_private,
+ completion_list);
+ list_del(&local->completion_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ free_mad = 0;
+ if (local->mad_priv) {
+ u8 base_version;
+ recv_mad_agent = local->recv_mad_agent;
+ if (!recv_mad_agent) {
+ dev_err(&mad_agent_priv->agent.device->dev,
+ "No receive MAD agent for local completion\n");
+ free_mad = 1;
+ goto local_send_completion;
+ }
+
+ /*
+ * Defined behavior is to complete response
+ * before request
+ */
+ build_smp_wc(recv_mad_agent->agent.qp,
+ local->mad_send_wr->send_wr.wr.wr_cqe,
+ be16_to_cpu(IB_LID_PERMISSIVE),
+ local->mad_send_wr->send_wr.pkey_index,
+ recv_mad_agent->agent.port_num, &wc);
+
+ local->mad_priv->header.recv_wc.wc = &wc;
+
+ base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version;
+ if (opa && base_version == OPA_MGMT_BASE_VERSION) {
+ local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len;
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
+ INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+ list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+ &local->mad_priv->header.recv_wc.rmpp_list);
+ local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+ local->mad_priv->header.recv_wc.recv_buf.mad =
+ (struct ib_mad *)local->mad_priv->mad;
+ if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+ snoop_recv(recv_mad_agent->qp_info,
+ &local->mad_priv->header.recv_wc,
+ IB_MAD_SNOOP_RECVS);
+ recv_mad_agent->agent.recv_handler(
+ &recv_mad_agent->agent,
+ &local->mad_send_wr->send_buf,
+ &local->mad_priv->header.recv_wc);
+ spin_lock_irqsave(&recv_mad_agent->lock, flags);
+ atomic_dec(&recv_mad_agent->refcount);
+ spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+ }
+
+local_send_completion:
+ /* Complete send */
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+ if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+ snoop_send(mad_agent_priv->qp_info,
+ &local->mad_send_wr->send_buf,
+ &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ if (free_mad)
+ kfree(local->mad_priv);
+ kfree(local);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret;
+
+ if (!mad_send_wr->retries_left)
+ return -ETIMEDOUT;
+
+ mad_send_wr->retries_left--;
+ mad_send_wr->send_buf.retries++;
+
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+ if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
+ ret = ib_retry_rmpp(mad_send_wr);
+ switch (ret) {
+ case IB_RMPP_RESULT_UNHANDLED:
+ ret = ib_send_mad(mad_send_wr);
+ break;
+ case IB_RMPP_RESULT_CONSUMED:
+ ret = 0;
+ break;
+ default:
+ ret = -ECOMM;
+ break;
+ }
+ } else
+ ret = ib_send_mad(mad_send_wr);
+
+ if (!ret) {
+ mad_send_wr->refcount++;
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+ return ret;
+}
+
+static void timeout_sends(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags, delay;
+
+ mad_agent_priv = container_of(work, struct ib_mad_agent_private,
+ timed_work.work);
+ mad_send_wc.vendor_err = 0;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->wait_list)) {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_send_wr->timeout, jiffies)) {
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(mad_agent_priv->qp_info->
+ port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ break;
+ }
+
+ list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->status == IB_WC_SUCCESS &&
+ !retry_send(mad_send_wr))
+ continue;
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status == IB_WC_SUCCESS)
+ mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+ else
+ mad_send_wc.status = mad_send_wr->status;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ atomic_dec(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad)
+{
+ unsigned long flags;
+ int post, ret;
+ struct ib_mad_private *mad_priv;
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr;
+ struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+ /* Initialize common scatter list fields */
+ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
+
+ /* Initialize common receive WR fields */
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+
+ do {
+ /* Allocate and map receive buffer */
+ if (mad) {
+ mad_priv = mad;
+ mad = NULL;
+ } else {
+ mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv),
+ GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+ sg_list.length = mad_priv_dma_size(mad_priv);
+ sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+ &mad_priv->grh,
+ mad_priv_dma_size(mad_priv),
+ DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
+ sg_list.addr))) {
+ kfree(mad_priv);
+ ret = -ENOMEM;
+ break;
+ }
+ mad_priv->header.mapping = sg_list.addr;
+ mad_priv->header.mad_list.mad_queue = recv_queue;
+ mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
+ recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
+
+ /* Post receive WR */
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ post = (++recv_queue->count < recv_queue->max_active);
+ list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ret = ib_post_recv(qp_info->qp, &recv_wr, NULL);
+ if (ret) {
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ list_del(&mad_priv->header.mad_list.list);
+ recv_queue->count--;
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ mad_priv->header.mapping,
+ mad_priv_dma_size(mad_priv),
+ DMA_FROM_DEVICE);
+ kfree(mad_priv);
+ dev_err(&qp_info->port_priv->device->dev,
+ "ib_post_recv failed: %d\n", ret);
+ break;
+ }
+ } while (post);
+
+ return ret;
+}
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv;
+ struct ib_mad_list_head *mad_list;
+
+ if (!qp_info->qp)
+ return;
+
+ while (!list_empty(&qp_info->recv_queue.list)) {
+
+ mad_list = list_entry(qp_info->recv_queue.list.next,
+ struct ib_mad_list_head, list);
+ mad_priv_hdr = container_of(mad_list,
+ struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+
+ /* Remove from posted receive MAD list */
+ list_del(&mad_list->list);
+
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ recv->header.mapping,
+ mad_priv_dma_size(recv),
+ DMA_FROM_DEVICE);
+ kfree(recv);
+ }
+
+ qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+ int ret, i;
+ struct ib_qp_attr *attr;
+ struct ib_qp *qp;
+ u16 pkey_index;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ ret = ib_find_pkey(port_priv->device, port_priv->port_num,
+ IB_DEFAULT_PKEY_FULL, &pkey_index);
+ if (ret)
+ pkey_index = 0;
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ qp = port_priv->qp_info[i].qp;
+ if (!qp)
+ continue;
+
+ /*
+ * PKey index for QP1 is irrelevant but
+ * one is needed for the Reset to Init transition
+ */
+ attr->qp_state = IB_QPS_INIT;
+ attr->pkey_index = pkey_index;
+ attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+ IB_QP_PKEY_INDEX | IB_QP_QKEY);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to INIT: %d\n",
+ i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTR: %d\n",
+ i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTS;
+ attr->sq_psn = IB_MAD_SEND_Q_PSN;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTS: %d\n",
+ i, ret);
+ goto out;
+ }
+ }
+
+ ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Failed to request completion notification: %d\n",
+ ret);
+ goto out;
+ }
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ if (!port_priv->qp_info[i].qp)
+ continue;
+
+ ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't post receive WRs\n");
+ goto out;
+ }
+ }
+out:
+ kfree(attr);
+ return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct ib_mad_qp_info *qp_info = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ dev_err(&qp_info->port_priv->device->dev,
+ "Fatal error (%d) on MAD QP (%d)\n",
+ event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_queue *mad_queue)
+{
+ mad_queue->qp_info = qp_info;
+ mad_queue->count = 0;
+ spin_lock_init(&mad_queue->lock);
+ INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info)
+{
+ qp_info->port_priv = port_priv;
+ init_mad_queue(qp_info, &qp_info->send_queue);
+ init_mad_queue(qp_info, &qp_info->recv_queue);
+ INIT_LIST_HEAD(&qp_info->overflow_list);
+ spin_lock_init(&qp_info->snoop_lock);
+ qp_info->snoop_table = NULL;
+ qp_info->snoop_table_size = 0;
+ atomic_set(&qp_info->snoop_count, 0);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+ enum ib_qp_type qp_type)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ int ret;
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.send_cq = qp_info->port_priv->cq;
+ qp_init_attr.recv_cq = qp_info->port_priv->cq;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.cap.max_send_wr = mad_sendq_size;
+ qp_init_attr.cap.max_recv_wr = mad_recvq_size;
+ qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+ qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+ qp_init_attr.qp_type = qp_type;
+ qp_init_attr.port_num = qp_info->port_priv->port_num;
+ qp_init_attr.qp_context = qp_info;
+ qp_init_attr.event_handler = qp_event_handler;
+ qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+ if (IS_ERR(qp_info->qp)) {
+ dev_err(&qp_info->port_priv->device->dev,
+ "Couldn't create ib_mad QP%d\n",
+ get_spl_qp_index(qp_type));
+ ret = PTR_ERR(qp_info->qp);
+ goto error;
+ }
+ /* Use minimum queue sizes unless the CQ is resized */
+ qp_info->send_queue.max_active = mad_sendq_size;
+ qp_info->recv_queue.max_active = mad_recvq_size;
+ return 0;
+
+error:
+ return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+ if (!qp_info->qp)
+ return;
+
+ ib_destroy_qp(qp_info->qp);
+ kfree(qp_info->snoop_table);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+ int port_num)
+{
+ int ret, cq_size;
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+ char name[sizeof "ib_mad123"];
+ int has_smi;
+
+ if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
+ return -EFAULT;
+
+ if (WARN_ON(rdma_cap_opa_mad(device, port_num) &&
+ rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE))
+ return -EFAULT;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv)
+ return -ENOMEM;
+
+ port_priv->device = device;
+ port_priv->port_num = port_num;
+ spin_lock_init(&port_priv->reg_lock);
+ init_mad_qp(port_priv, &port_priv->qp_info[0]);
+ init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+ cq_size = mad_sendq_size + mad_recvq_size;
+ has_smi = rdma_cap_ib_smi(device, port_num);
+ if (has_smi)
+ cq_size *= 2;
+
+ port_priv->pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(port_priv->pd)) {
+ dev_err(&device->dev, "Couldn't create ib_mad PD\n");
+ ret = PTR_ERR(port_priv->pd);
+ goto error3;
+ }
+
+ port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
+ IB_POLL_UNBOUND_WORKQUEUE);
+ if (IS_ERR(port_priv->cq)) {
+ dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
+ ret = PTR_ERR(port_priv->cq);
+ goto error4;
+ }
+
+ if (has_smi) {
+ ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+ if (ret)
+ goto error6;
+ }
+ ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+ if (ret)
+ goto error7;
+
+ snprintf(name, sizeof name, "ib_mad%d", port_num);
+ port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (!port_priv->wq) {
+ ret = -ENOMEM;
+ goto error8;
+ }
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ ret = ib_mad_port_start(port_priv);
+ if (ret) {
+ dev_err(&device->dev, "Couldn't start port\n");
+ goto error9;
+ }
+
+ return 0;
+
+error9:
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+error8:
+ destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+ destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+ ib_free_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+error4:
+ ib_dealloc_pd(port_priv->pd);
+error3:
+ kfree(port_priv);
+
+ return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ port_priv = __ib_get_mad_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+ destroy_mad_qp(&port_priv->qp_info[1]);
+ destroy_mad_qp(&port_priv->qp_info[0]);
+ ib_free_cq(port_priv->cq);
+ ib_dealloc_pd(port_priv->pd);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+ /* XXX: Handle deallocation of MAD registration tables */
+
+ kfree(port_priv);
+
+ return 0;
+}
+
+static void ib_mad_init_device(struct ib_device *device)
+{
+ int start, i;
+
+ start = rdma_start_port(device);
+
+ for (i = start; i <= rdma_end_port(device); i++) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ if (ib_mad_port_open(device, i)) {
+ dev_err(&device->dev, "Couldn't open port %d\n", i);
+ goto error;
+ }
+ if (ib_agent_port_open(device, i)) {
+ dev_err(&device->dev,
+ "Couldn't open port %d for agents\n", i);
+ goto error_agent;
+ }
+ }
+ return;
+
+error_agent:
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
+
+error:
+ while (--i >= start) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ if (ib_agent_port_close(device, i))
+ dev_err(&device->dev,
+ "Couldn't close port %d for agents\n", i);
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
+ }
+}
+
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
+{
+ int i;
+
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ if (ib_agent_port_close(device, i))
+ dev_err(&device->dev,
+ "Couldn't close port %d for agents\n", i);
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
+ }
+}
+
+static struct ib_client mad_client = {
+ .name = "mad",
+ .add = ib_mad_init_device,
+ .remove = ib_mad_remove_device
+};
+
+int ib_mad_init(void)
+{
+ mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+ mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+ mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+ mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+ INIT_LIST_HEAD(&ib_mad_port_list);
+
+ /* Client ID 0 is used for snoop-only clients */
+ idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL);
+
+ if (ib_register_client(&mad_client)) {
+ pr_err("Couldn't register ib_mad client\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void ib_mad_cleanup(void)
+{
+ ib_unregister_client(&mad_client);
+}
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 000000000..d84ae1671
--- /dev/null
+++ b/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/opa_smi.h>
+
+#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE 128
+#define IB_MAD_QP_RECV_SIZE 512
+#define IB_MAD_QP_MIN_SIZE 64
+#define IB_MAD_QP_MAX_SIZE 8192
+#define IB_MAD_SEND_REQ_MAX_SG 2
+#define IB_MAD_RECV_REQ_MAX_SG 1
+
+#define IB_MAD_SEND_Q_PSN 0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS 80
+#define MAX_MGMT_VERSION 0x83
+#define MAX_MGMT_OUI 8
+#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+ struct list_head list;
+ struct ib_cqe cqe;
+ struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+ struct ib_mad_list_head mad_list;
+ struct ib_mad_recv_wc recv_wc;
+ struct ib_wc wc;
+ u64 mapping;
+} __attribute__ ((packed));
+
+struct ib_mad_private {
+ struct ib_mad_private_header header;
+ size_t mad_size;
+ struct ib_grh grh;
+ u8 mad[0];
+} __attribute__ ((packed));
+
+struct ib_rmpp_segment {
+ struct list_head list;
+ u32 num;
+ u8 data[0];
+};
+
+struct ib_mad_agent_private {
+ struct ib_mad_agent agent;
+ struct ib_mad_reg_req *reg_req;
+ struct ib_mad_qp_info *qp_info;
+
+ spinlock_t lock;
+ struct list_head send_list;
+ struct list_head wait_list;
+ struct list_head done_list;
+ struct delayed_work timed_work;
+ unsigned long timeout;
+ struct list_head local_list;
+ struct work_struct local_work;
+ struct list_head rmpp_list;
+
+ atomic_t refcount;
+ union {
+ struct completion comp;
+ struct rcu_head rcu;
+ };
+};
+
+struct ib_mad_snoop_private {
+ struct ib_mad_agent agent;
+ struct ib_mad_qp_info *qp_info;
+ int snoop_index;
+ int mad_snoop_flags;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+struct ib_mad_send_wr_private {
+ struct ib_mad_list_head mad_list;
+ struct list_head agent_list;
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf send_buf;
+ u64 header_mapping;
+ u64 payload_mapping;
+ struct ib_ud_wr send_wr;
+ struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+ __be64 tid;
+ unsigned long timeout;
+ int max_retries;
+ int retries_left;
+ int retry;
+ int refcount;
+ enum ib_wc_status status;
+
+ /* RMPP control */
+ struct list_head rmpp_list;
+ struct ib_rmpp_segment *last_ack_seg;
+ struct ib_rmpp_segment *cur_seg;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int pad;
+};
+
+struct ib_mad_local_private {
+ struct list_head completion_list;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_agent_private *recv_mad_agent;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ size_t return_wc_byte_len;
+};
+
+struct ib_mad_mgmt_method_table {
+ struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+ u8 oui[MAX_MGMT_OUI][3];
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+ struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int max_active;
+ struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+ struct ib_mad_port_private *port_priv;
+ struct ib_qp *qp;
+ struct ib_mad_queue send_queue;
+ struct ib_mad_queue recv_queue;
+ struct list_head overflow_list;
+ spinlock_t snoop_lock;
+ struct ib_mad_snoop_private **snoop_table;
+ int snoop_table_size;
+ atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+ struct list_head port_list;
+ struct ib_device *device;
+ int port_num;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+
+ spinlock_t reg_lock;
+ struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+ struct workqueue_struct *wq;
+ struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_send_wr_private *
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ int timeout_ms);
+
+#endif /* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
new file mode 100644
index 000000000..e5cf09c66
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -0,0 +1,968 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+enum rmpp_state {
+ RMPP_STATE_ACTIVE,
+ RMPP_STATE_TIMEOUT,
+ RMPP_STATE_COMPLETE,
+ RMPP_STATE_CANCELING
+};
+
+struct mad_rmpp_recv {
+ struct ib_mad_agent_private *agent;
+ struct list_head list;
+ struct delayed_work timeout_work;
+ struct delayed_work cleanup_work;
+ struct completion comp;
+ enum rmpp_state state;
+ spinlock_t lock;
+ atomic_t refcount;
+
+ struct ib_ah *ah;
+ struct ib_mad_recv_wc *rmpp_wc;
+ struct ib_mad_recv_buf *cur_seg_buf;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int repwin;
+
+ __be64 tid;
+ u32 src_qp;
+ u32 slid;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ u8 base_version;
+};
+
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ if (atomic_dec_and_test(&rmpp_recv->refcount))
+ complete(&rmpp_recv->comp);
+}
+
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ deref_rmpp_recv(rmpp_recv);
+ wait_for_completion(&rmpp_recv->comp);
+ rdma_destroy_ah(rmpp_recv->ah);
+ kfree(rmpp_recv);
+}
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+ struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+ ib_free_recv_mad(rmpp_recv->rmpp_wc);
+ rmpp_recv->state = RMPP_STATE_CANCELING;
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+ cancel_delayed_work(&rmpp_recv->cleanup_work);
+ }
+
+ flush_workqueue(agent->qp_info->port_priv->wq);
+
+ list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+ &agent->rmpp_list, list) {
+ list_del(&rmpp_recv->list);
+ destroy_rmpp_recv(rmpp_recv);
+ }
+}
+
+static void format_ack(struct ib_mad_send_buf *msg,
+ struct ib_rmpp_mad *data,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *ack = msg->mad;
+ unsigned long flags;
+
+ memcpy(ack, &data->mad_hdr, msg->hdr_len);
+
+ ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+ ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ rmpp_recv->last_ack = rmpp_recv->seg_num;
+ ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num);
+ ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+}
+
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ int ret, hdr_len;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1, hdr_len,
+ 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
+ if (IS_ERR(msg))
+ return;
+
+ format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+ msg->ah = rmpp_recv->ah;
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_ah *ah;
+ int hdr_len;
+
+ ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+ recv_wc->recv_buf.grh, agent->port_num);
+ if (IS_ERR(ah))
+ return (void *) ah;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1,
+ hdr_len, 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
+ if (IS_ERR(msg))
+ rdma_destroy_ah(ah);
+ else {
+ msg->ah = ah;
+ msg->context[0] = ah;
+ }
+
+ return msg;
+}
+
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ rdma_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+ if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
+ rdma_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void nack_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status;
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = 0;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ rdma_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
+static void recv_timeout_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, timeout_work.work);
+ struct ib_mad_recv_wc *rmpp_wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ rmpp_recv->state = RMPP_STATE_TIMEOUT;
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+ destroy_rmpp_recv(rmpp_recv);
+ ib_free_recv_mad(rmpp_wc);
+}
+
+static void recv_cleanup_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state == RMPP_STATE_CANCELING) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ destroy_rmpp_recv(rmpp_recv);
+}
+
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr;
+
+ rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL);
+ if (!rmpp_recv)
+ return NULL;
+
+ rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+ mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh,
+ agent->agent.port_num);
+ if (IS_ERR(rmpp_recv->ah))
+ goto error;
+
+ rmpp_recv->agent = agent;
+ init_completion(&rmpp_recv->comp);
+ INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler);
+ INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
+ spin_lock_init(&rmpp_recv->lock);
+ rmpp_recv->state = RMPP_STATE_ACTIVE;
+ atomic_set(&rmpp_recv->refcount, 1);
+
+ rmpp_recv->rmpp_wc = mad_recv_wc;
+ rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
+ rmpp_recv->newwin = 1;
+ rmpp_recv->seg_num = 1;
+ rmpp_recv->last_ack = 0;
+ rmpp_recv->repwin = 1;
+
+ mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+ rmpp_recv->tid = mad_hdr->tid;
+ rmpp_recv->src_qp = mad_recv_wc->wc->src_qp;
+ rmpp_recv->slid = mad_recv_wc->wc->slid;
+ rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
+ rmpp_recv->class_version = mad_hdr->class_version;
+ rmpp_recv->method = mad_hdr->method;
+ rmpp_recv->base_version = mad_hdr->base_version;
+ return rmpp_recv;
+
+error: kfree(rmpp_recv);
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid == mad_hdr->tid &&
+ rmpp_recv->src_qp == mad_recv_wc->wc->src_qp &&
+ rmpp_recv->slid == mad_recv_wc->wc->slid &&
+ rmpp_recv->mgmt_class == mad_hdr->mgmt_class &&
+ rmpp_recv->class_version == mad_hdr->class_version &&
+ rmpp_recv->method == mad_hdr->method)
+ return rmpp_recv;
+ }
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv)
+ atomic_inc(&rmpp_recv->refcount);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ return rmpp_recv;
+}
+
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct mad_rmpp_recv *cur_rmpp_recv;
+
+ cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+ if (!cur_rmpp_recv)
+ list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+
+ return cur_rmpp_recv;
+}
+
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+}
+
+static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
+ struct ib_mad_recv_buf *seg)
+{
+ if (seg->list.next == rmpp_list)
+ return NULL;
+
+ return container_of(seg->list.next, struct ib_mad_recv_buf, list);
+}
+
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+ return max(agent->qp_info->recv_queue.max_active >> 3, 1);
+}
+
+static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
+ int seg_num)
+{
+ struct ib_mad_recv_buf *seg_buf;
+ int cur_seg_num;
+
+ list_for_each_entry_reverse(seg_buf, rmpp_list, list) {
+ cur_seg_num = get_seg_num(seg_buf);
+ if (seg_num > cur_seg_num)
+ return seg_buf;
+ if (seg_num == cur_seg_num)
+ break;
+ }
+ return NULL;
+}
+
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_buf *new_buf)
+{
+ struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list;
+
+ while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) {
+ rmpp_recv->cur_seg_buf = new_buf;
+ rmpp_recv->seg_num++;
+ new_buf = get_next_seg(rmpp_list, new_buf);
+ }
+}
+
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int hdr_size, data_size, pad;
+ bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device,
+ rmpp_recv->agent->qp_info->port_priv->port_num);
+
+ rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+
+ hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) {
+ data_size = sizeof(struct opa_rmpp_mad) - hdr_size;
+ pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > OPA_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ } else {
+ data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+ pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ }
+
+ return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_mad_recv_wc *rmpp_wc;
+
+ ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+ if (rmpp_recv->seg_num > 1)
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+ /* 10 seconds until we can find the packet lifetime */
+ queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+ &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+ return rmpp_wc;
+}
+
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_recv_buf *prev_buf;
+ struct ib_mad_recv_wc *done_wc;
+ int seg_num;
+ unsigned long flags;
+
+ rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv)
+ goto drop1;
+
+ seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+ (seg_num > rmpp_recv->newwin))
+ goto drop3;
+
+ if ((seg_num <= rmpp_recv->last_ack) ||
+ (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto drop2;
+ }
+
+ prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+ if (!prev_buf)
+ goto drop3;
+
+ done_wc = NULL;
+ list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+ if (rmpp_recv->cur_seg_buf == prev_buf) {
+ update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+ if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ done_wc = complete_rmpp(rmpp_recv);
+ goto out;
+ } else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+ rmpp_recv->newwin += window_size(agent);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto out;
+ }
+ }
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+ deref_rmpp_recv(rmpp_recv);
+ return done_wc;
+
+drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2: deref_rmpp_recv(rmpp_recv);
+drop1: ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv) {
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ if (insert_rmpp_recv(agent, rmpp_recv)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* duplicate first MAD */
+ destroy_rmpp_recv(rmpp_recv);
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+ atomic_inc(&rmpp_recv->refcount);
+
+ if (get_last_flag(&mad_recv_wc->recv_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&agent->lock, flags);
+ complete_rmpp(rmpp_recv);
+ } else {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* 40 seconds until we can find the packet lifetimes */
+ queue_delayed_work(agent->qp_info->port_priv->wq,
+ &rmpp_recv->timeout_work,
+ msecs_to_jiffies(40000));
+ rmpp_recv->newwin += window_size(agent);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ mad_recv_wc = NULL;
+ }
+ deref_rmpp_recv(rmpp_recv);
+ return mad_recv_wc;
+}
+
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int timeout;
+ u32 paylen = 0;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num);
+
+ if (mad_send_wr->seg_num == 1) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+ paylen = (mad_send_wr->send_buf.seg_count *
+ mad_send_wr->send_buf.seg_rmpp_size) -
+ mad_send_wr->pad;
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+ paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad;
+ }
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+ /* 2 seconds for an ACK until we can find the packet lifetime */
+ timeout = mad_send_wr->send_buf.timeout_ms;
+ if (!timeout || timeout > 2000)
+ mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+ return ib_send_mad(mad_send_wr);
+}
+
+static void abort_send(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr)
+ goto out; /* Unmatched send */
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+ int seg_num)
+{
+ struct list_head *list;
+
+ wr->last_ack = seg_num;
+ list = &wr->last_ack_seg->list;
+ list_for_each_entry(wr->last_ack_seg, list, list)
+ if (wr->last_ack_seg->num == seg_num)
+ break;
+}
+
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE)
+ rmpp_recv->repwin = newwin;
+}
+
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_rmpp_mad *rmpp_mad;
+ unsigned long flags;
+ int seg_num, newwin, ret;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (rmpp_mad->rmpp_hdr.rmpp_status) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ return;
+ }
+
+ seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+ newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (newwin < seg_num) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ return;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr) {
+ if (!seg_num)
+ process_ds_ack(agent, mad_recv_wc, newwin);
+ goto out; /* Unmatched or DS RMPP ACK */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+ (mad_send_wr->timeout)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return; /* Repeated ACK for DS RMPP transaction */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ if (seg_num > mad_send_wr->send_buf.seg_count ||
+ seg_num > mad_send_wr->newwin) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ return;
+ }
+
+ if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+ goto out; /* Old ACK */
+
+ if (seg_num > mad_send_wr->last_ack) {
+ adjust_last_ack(mad_send_wr, seg_num);
+ mad_send_wr->retries_left = mad_send_wr->max_retries;
+ }
+ mad_send_wr->newwin = newwin;
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ /* If no response is expected, the ACK completes the send */
+ if (!mad_send_wr->send_buf.timeout_ms) {
+ struct ib_mad_send_wc wc;
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
+ if (mad_send_wr->refcount == 1)
+ ib_reset_mad_timeout(mad_send_wr,
+ mad_send_wr->send_buf.timeout_ms);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return;
+ } else if (mad_send_wr->refcount == 1 &&
+ mad_send_wr->seg_num < mad_send_wr->newwin &&
+ mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+ /* Send failure will just result in a timeout/retry */
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ goto out;
+
+ mad_send_wr->refcount++;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_hdr *rmpp_hdr;
+ u8 rmpp_status;
+
+ rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+ if (rmpp_hdr->rmpp_status) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+ goto bad;
+ }
+
+ if (rmpp_hdr->seg_num == cpu_to_be32(1)) {
+ if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return start_rmpp(agent, mad_recv_wc);
+ } else {
+ if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+bad:
+ nack_recv(agent, mad_recv_wc, rmpp_status);
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN ||
+ rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return mad_recv_wc;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ goto out;
+ }
+
+ switch (rmpp_mad->rmpp_hdr.rmpp_type) {
+ case IB_MGMT_RMPP_TYPE_DATA:
+ return process_rmpp_data(agent, mad_recv_wc);
+ case IB_MGMT_RMPP_TYPE_ACK:
+ process_rmpp_ack(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_STOP:
+ process_rmpp_stop(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_ABORT:
+ process_rmpp_abort(agent, mad_recv_wc);
+ break;
+ default:
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ break;
+ }
+out:
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+ struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+ struct mad_rmpp_recv *rmpp_recv;
+ struct rdma_ah_attr ah_attr;
+ unsigned long flags;
+ int newwin = 1;
+
+ if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+ goto out;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid != mad_hdr->tid ||
+ rmpp_recv->mgmt_class != mad_hdr->mgmt_class ||
+ rmpp_recv->class_version != mad_hdr->class_version ||
+ (rmpp_recv->method & IB_MGMT_METHOD_RESP))
+ continue;
+
+ if (rdma_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+ continue;
+
+ if (rmpp_recv->slid == rdma_ah_get_dlid(&ah_attr)) {
+ newwin = rmpp_recv->repwin;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+out:
+ return newwin;
+}
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+ mad_send_wr->seg_num = 1;
+ return IB_RMPP_RESULT_INTERNAL;
+ }
+
+ mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+ /* We need to wait for the final ACK even if there isn't a response */
+ mad_send_wr->refcount += (mad_send_wr->timeout == 0);
+ ret = send_next_seg(mad_send_wr);
+ if (!ret)
+ return IB_RMPP_RESULT_CONSUMED;
+ return ret;
+}
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+ return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
+
+ if (mad_send_wc->status != IB_WC_SUCCESS ||
+ mad_send_wr->status != IB_WC_SUCCESS)
+ return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
+
+ if (!mad_send_wr->timeout)
+ return IB_RMPP_RESULT_PROCESSED; /* Response received */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ mad_send_wr->timeout =
+ msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ return IB_RMPP_RESULT_PROCESSED; /* Send done */
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+ mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret) {
+ mad_send_wc->status = IB_WC_GENERAL_ERR;
+ return IB_RMPP_RESULT_PROCESSED;
+ }
+ return IB_RMPP_RESULT_CONSUMED;
+}
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ mad_send_wr->seg_num = mad_send_wr->last_ack;
+ mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ return IB_RMPP_RESULT_CONSUMED;
+}
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
new file mode 100644
index 000000000..3d336bff1
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+enum {
+ IB_RMPP_RESULT_PROCESSED,
+ IB_RMPP_RESULT_CONSUMED,
+ IB_RMPP_RESULT_INTERNAL,
+ IB_RMPP_RESULT_UNHANDLED
+};
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif /* __MAD_RMPP_H__ */
diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c
new file mode 100644
index 000000000..49d478b2e
--- /dev/null
+++ b/drivers/infiniband/core/mr_pool.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/mr_pool.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ mr = list_first_entry_or_null(list, struct ib_mr, qp_entry);
+ if (mr) {
+ list_del(&mr->qp_entry);
+ qp->mrs_used++;
+ }
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_mr_pool_get);
+
+void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_add(&mr->qp_entry, list);
+ qp->mrs_used--;
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_put);
+
+int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
+ enum ib_mr_type type, u32 max_num_sg)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+ int ret, i;
+
+ for (i = 0; i < nr; i++) {
+ mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto out;
+ }
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_add_tail(&mr->qp_entry, list);
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ }
+
+ return 0;
+out:
+ ib_mr_pool_destroy(qp, list);
+ return ret;
+}
+EXPORT_SYMBOL(ib_mr_pool_init);
+
+void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ while (!list_empty(list)) {
+ mr = list_first_entry(list, struct ib_mr, qp_entry);
+ list_del(&mr->qp_entry);
+
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ ib_dereg_mr(mr);
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_destroy);
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
new file mode 100644
index 000000000..d50ff70bb
--- /dev/null
+++ b/drivers/infiniband/core/multicast.c
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+static void mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client mcast_client = {
+ .name = "ib_multicast",
+ .add = mcast_add_one,
+ .remove = mcast_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct workqueue_struct *mcast_wq;
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+struct mcast_port {
+ struct mcast_device *dev;
+ spinlock_t lock;
+ struct rb_root table;
+ atomic_t refcount;
+ struct completion comp;
+ u8 port_num;
+};
+
+struct mcast_device {
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ int start_port;
+ int end_port;
+ struct mcast_port port[0];
+};
+
+enum mcast_state {
+ MCAST_JOINING,
+ MCAST_MEMBER,
+ MCAST_ERROR,
+};
+
+enum mcast_group_state {
+ MCAST_IDLE,
+ MCAST_BUSY,
+ MCAST_GROUP_ERROR,
+ MCAST_PKEY_EVENT
+};
+
+enum {
+ MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+struct mcast_group {
+ struct ib_sa_mcmember_rec rec;
+ struct rb_node node;
+ struct mcast_port *port;
+ spinlock_t lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ struct list_head active_list;
+ struct mcast_member *last_join;
+ int members[NUM_JOIN_MEMBERSHIP_TYPES];
+ atomic_t refcount;
+ enum mcast_group_state state;
+ struct ib_sa_query *query;
+ u16 pkey_index;
+ u8 leave_state;
+ int retries;
+};
+
+struct mcast_member {
+ struct ib_sa_multicast multicast;
+ struct ib_sa_client *client;
+ struct mcast_group *group;
+ struct list_head list;
+ enum mcast_state state;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+
+static struct mcast_group *mcast_find(struct mcast_port *port,
+ union ib_gid *mgid)
+{
+ struct rb_node *node = port->table.rb_node;
+ struct mcast_group *group;
+ int ret;
+
+ while (node) {
+ group = rb_entry(node, struct mcast_group, node);
+ ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+ if (!ret)
+ return group;
+
+ if (ret < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+ struct mcast_group *group,
+ int allow_duplicates)
+{
+ struct rb_node **link = &port->table.rb_node;
+ struct rb_node *parent = NULL;
+ struct mcast_group *cur_group;
+ int ret;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct mcast_group, node);
+
+ ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+ sizeof group->rec.mgid);
+ if (ret < 0)
+ link = &(*link)->rb_left;
+ else if (ret > 0)
+ link = &(*link)->rb_right;
+ else if (allow_duplicates)
+ link = &(*link)->rb_left;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &port->table);
+ return NULL;
+}
+
+static void deref_port(struct mcast_port *port)
+{
+ if (atomic_dec_and_test(&port->refcount))
+ complete(&port->comp);
+}
+
+static void release_group(struct mcast_group *group)
+{
+ struct mcast_port *port = group->port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ if (atomic_dec_and_test(&group->refcount)) {
+ rb_erase(&group->node, &port->table);
+ spin_unlock_irqrestore(&port->lock, flags);
+ kfree(group);
+ deref_port(port);
+ } else
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct mcast_member *member)
+{
+ if (atomic_dec_and_test(&member->refcount))
+ complete(&member->comp);
+}
+
+static void queue_join(struct mcast_member *member)
+{
+ struct mcast_group *group = member->group;
+ unsigned long flags;
+
+ spin_lock_irqsave(&group->lock, flags);
+ list_add_tail(&member->list, &group->pending_list);
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * A multicast group has four types of members: full member, non member,
+ * sendonly non member and sendonly full member.
+ * We need to keep track of the number of members of each
+ * type based on their join state. Adjust the number of members the belong to
+ * the specified join states.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+ int i;
+
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1)
+ if (join_state & 0x1)
+ group->members[i] += inc;
+}
+
+/*
+ * If a multicast group has zero members left for a particular join state, but
+ * the group is still a member with the SA, we need to leave that join state.
+ * Determine which join states we still belong to, but that do not have any
+ * active members.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+ u8 leave_state = 0;
+ int i;
+
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++)
+ if (!group->members[i])
+ leave_state |= (0x1 << i);
+
+ return leave_state & group->rec.join_state;
+}
+
+static int check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 selector, u8 src_value, u8 dst_value)
+{
+ int err;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+ struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+ /* MGID must already match */
+
+ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+ memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+ return -EINVAL;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+ src->mtu, dst->mtu))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+ src->traffic_class != dst->traffic_class)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+ return -EINVAL;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+ src->rate, dst->rate))
+ return -EINVAL;
+ if (check_selector(comp_mask,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ dst->packet_life_time_selector,
+ src->packet_life_time, dst->packet_life_time))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+ src->flow_label != dst->flow_label)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+ src->hop_limit != dst->hop_limit)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+ return -EINVAL;
+
+ /* join_state checked separately, proxy_join ignored */
+
+ return 0;
+}
+
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+ struct mcast_port *port = group->port;
+ int ret;
+
+ group->last_join = member;
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_MGMT_METHOD_SET,
+ &member->multicast.rec,
+ member->multicast.comp_mask,
+ 3000, GFP_KERNEL, join_handler, group,
+ &group->query);
+ return (ret > 0) ? 0 : ret;
+}
+
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+ struct mcast_port *port = group->port;
+ struct ib_sa_mcmember_rec rec;
+ int ret;
+
+ rec = group->rec;
+ rec.join_state = leave_state;
+ group->leave_state = leave_state;
+
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_SA_METHOD_DELETE, &rec,
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ 3000, GFP_KERNEL, leave_handler,
+ group, &group->query);
+ return (ret > 0) ? 0 : ret;
+}
+
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+ u8 join_state)
+{
+ member->state = MCAST_MEMBER;
+ adjust_membership(group, join_state, 1);
+ group->rec.join_state |= join_state;
+ member->multicast.rec = group->rec;
+ member->multicast.rec.join_state = join_state;
+ list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+ int status)
+{
+ spin_lock_irq(&group->lock);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ return member->multicast.callback(status, &member->multicast);
+}
+
+static void process_group_error(struct mcast_group *group)
+{
+ struct mcast_member *member;
+ int ret = 0;
+ u16 pkey_index;
+
+ if (group->state == MCAST_PKEY_EVENT)
+ ret = ib_find_pkey(group->port->dev->device,
+ group->port->port_num,
+ be16_to_cpu(group->rec.pkey), &pkey_index);
+
+ spin_lock_irq(&group->lock);
+ if (group->state == MCAST_PKEY_EVENT && !ret &&
+ group->pkey_index == pkey_index)
+ goto out;
+
+ while (!list_empty(&group->active_list)) {
+ member = list_entry(group->active_list.next,
+ struct mcast_member, list);
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ adjust_membership(group, member->multicast.rec.join_state, -1);
+ member->state = MCAST_ERROR;
+ spin_unlock_irq(&group->lock);
+
+ ret = member->multicast.callback(-ENETRESET,
+ &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ group->rec.join_state = 0;
+out:
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+}
+
+static void mcast_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int status, ret;
+ u8 join_state;
+
+ group = container_of(work, typeof(*group), work);
+retest:
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->pending_list) ||
+ (group->state != MCAST_BUSY)) {
+
+ if (group->state != MCAST_BUSY) {
+ spin_unlock_irq(&group->lock);
+ process_group_error(group);
+ goto retest;
+ }
+
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ multicast = &member->multicast;
+ join_state = multicast->rec.join_state;
+ atomic_inc(&member->refcount);
+
+ if (join_state == (group->rec.join_state & join_state)) {
+ status = cmp_rec(&group->rec, &multicast->rec,
+ multicast->comp_mask);
+ if (!status)
+ join_group(group, member, join_state);
+ else
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = multicast->callback(status, multicast);
+ } else {
+ spin_unlock_irq(&group->lock);
+ status = send_join(group, member);
+ if (!status) {
+ deref_member(member);
+ return;
+ }
+ ret = fail_join(group, member, status);
+ }
+
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ join_state = get_leave_state(group);
+ if (join_state) {
+ group->rec.join_state &= ~join_state;
+ spin_unlock_irq(&group->lock);
+ if (send_leave(group, join_state))
+ goto retest;
+ } else {
+ group->state = MCAST_IDLE;
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+ struct mcast_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ if (group->last_join == member) {
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = member->multicast.callback(status, &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ } else
+ spin_unlock_irq(&group->lock);
+}
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+ u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+ if (status)
+ process_join_error(group, status);
+ else {
+ int mgids_changed, is_mgid0;
+
+ if (ib_find_pkey(group->port->dev->device,
+ group->port->port_num, be16_to_cpu(rec->pkey),
+ &pkey_index))
+ pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+ spin_lock_irq(&group->port->lock);
+ if (group->state == MCAST_BUSY &&
+ group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+ group->pkey_index = pkey_index;
+ mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
+ sizeof(group->rec.mgid));
+ group->rec = *rec;
+ if (mgids_changed) {
+ rb_erase(&group->node, &group->port->table);
+ is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+ sizeof(mgid0));
+ mcast_insert(group->port, group, is_mgid0);
+ }
+ spin_unlock_irq(&group->port->lock);
+ }
+ mcast_work_handler(&group->work);
+}
+
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+
+ if (status && group->retries > 0 &&
+ !send_leave(group, group->leave_state))
+ group->retries--;
+ else
+ mcast_work_handler(&group->work);
+}
+
+static struct mcast_group *acquire_group(struct mcast_port *port,
+ union ib_gid *mgid, gfp_t gfp_mask)
+{
+ struct mcast_group *group, *cur_group;
+ unsigned long flags;
+ int is_mgid0;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ goto found;
+ spin_unlock_irqrestore(&port->lock, flags);
+ }
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return NULL;
+
+ group->retries = 3;
+ group->port = port;
+ group->rec.mgid = *mgid;
+ group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->active_list);
+ INIT_WORK(&group->work, mcast_work_handler);
+ spin_lock_init(&group->lock);
+
+ spin_lock_irqsave(&port->lock, flags);
+ cur_group = mcast_insert(port, group, is_mgid0);
+ if (cur_group) {
+ kfree(group);
+ group = cur_group;
+ } else
+ atomic_inc(&port->refcount);
+found:
+ atomic_inc(&group->refcount);
+ spin_unlock_irqrestore(&port->lock, flags);
+ return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier. Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_sa_multicast *multicast),
+ void *context)
+{
+ struct mcast_device *dev;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int ret;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ member = kmalloc(sizeof *member, gfp_mask);
+ if (!member)
+ return ERR_PTR(-ENOMEM);
+
+ ib_sa_client_get(client);
+ member->client = client;
+ member->multicast.rec = *rec;
+ member->multicast.comp_mask = comp_mask;
+ member->multicast.callback = callback;
+ member->multicast.context = context;
+ init_completion(&member->comp);
+ atomic_set(&member->refcount, 1);
+ member->state = MCAST_JOINING;
+
+ member->group = acquire_group(&dev->port[port_num - dev->start_port],
+ &rec->mgid, gfp_mask);
+ if (!member->group) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /*
+ * The user will get the multicast structure in their callback. They
+ * could then free the multicast structure before we can return from
+ * this routine. So we save the pointer to return before queuing
+ * any callback.
+ */
+ multicast = &member->multicast;
+ queue_join(member);
+ return multicast;
+
+err:
+ ib_sa_client_put(client);
+ kfree(member);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
+
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+ struct mcast_member *member;
+ struct mcast_group *group;
+
+ member = container_of(multicast, struct mcast_member, multicast);
+ group = member->group;
+
+ spin_lock_irq(&group->lock);
+ if (member->state == MCAST_MEMBER)
+ adjust_membership(group, multicast->rec.join_state, -1);
+
+ list_del_init(&member->list);
+
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+ /* Continue to hold reference on group until callback */
+ queue_work(mcast_wq, &group->work);
+ } else {
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+
+ deref_member(member);
+ wait_for_completion(&member->comp);
+ ib_sa_client_put(member->client);
+ kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+ union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ struct mcast_group *group;
+ unsigned long flags;
+ int ret = 0;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return -ENODEV;
+
+ port = &dev->port[port_num - dev->start_port];
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ *rec = group->rec;
+ else
+ ret = -EADDRNOTAVAIL;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize AH attribute from multicast
+ * member record and gid of the device.
+ * @device: RDMA device
+ * @port_num: Port of the rdma device to consider
+ * @ndev: Optional netdevice, applicable only for RoCE
+ * @gid_type: GID type to consider
+ * @ah_attr: AH attribute to fillup on successful completion
+ *
+ * ib_init_ah_from_mcmember() initializes AH attribute based on multicast
+ * member record and other device properties. On success the caller is
+ * responsible to call rdma_destroy_ah_attr on the ah_attr. Returns 0 on
+ * success or appropriate error code.
+ *
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ struct net_device *ndev,
+ enum ib_gid_type gid_type,
+ struct rdma_ah_attr *ah_attr)
+{
+ const struct ib_gid_attr *sgid_attr;
+
+ /* GID table is not based on the netdevice for IB link layer,
+ * so ignore ndev during search.
+ */
+ if (rdma_protocol_ib(device, port_num))
+ ndev = NULL;
+ else if (!rdma_protocol_roce(device, port_num))
+ return -EINVAL;
+
+ sgid_attr = rdma_find_gid_by_port(device, &rec->port_gid,
+ gid_type, port_num, ndev);
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+
+ memset(ah_attr, 0, sizeof(*ah_attr));
+ ah_attr->type = rdma_ah_find_type(device, port_num);
+
+ rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid));
+ rdma_ah_set_sl(ah_attr, rec->sl);
+ rdma_ah_set_port_num(ah_attr, port_num);
+ rdma_ah_set_static_rate(ah_attr, rec->rate);
+ rdma_move_grh_sgid_attr(ah_attr, &rec->mgid,
+ be32_to_cpu(rec->flow_label),
+ rec->hop_limit, rec->traffic_class,
+ sgid_attr);
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+static void mcast_groups_event(struct mcast_port *port,
+ enum mcast_group_state state)
+{
+ struct mcast_group *group;
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ for (node = rb_first(&port->table); node; node = rb_next(node)) {
+ group = rb_entry(node, struct mcast_group, node);
+ spin_lock(&group->lock);
+ if (group->state == MCAST_IDLE) {
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ if (group->state != MCAST_GROUP_ERROR)
+ group->state = state;
+ spin_unlock(&group->lock);
+ }
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct mcast_device *dev;
+ int index;
+
+ dev = container_of(handler, struct mcast_device, event_handler);
+ if (!rdma_cap_ib_mcast(dev->device, event->element.port_num))
+ return;
+
+ index = event->element.port_num - dev->start_port;
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+ break;
+ case IB_EVENT_PKEY_CHANGE:
+ mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
+ break;
+ default:
+ break;
+ }
+}
+
+static void mcast_add_one(struct ib_device *device)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ int i;
+ int count = 0;
+
+ dev = kmalloc(struct_size(dev, port, device->phys_port_cnt),
+ GFP_KERNEL);
+ if (!dev)
+ return;
+
+ dev->start_port = rdma_start_port(device);
+ dev->end_port = rdma_end_port(device);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (!rdma_cap_ib_mcast(device, dev->start_port + i))
+ continue;
+ port = &dev->port[i];
+ port->dev = dev;
+ port->port_num = dev->start_port + i;
+ spin_lock_init(&port->lock);
+ port->table = RB_ROOT;
+ init_completion(&port->comp);
+ atomic_set(&port->refcount, 1);
+ ++count;
+ }
+
+ if (!count) {
+ kfree(dev);
+ return;
+ }
+
+ dev->device = device;
+ ib_set_client_data(device, &mcast_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+ ib_register_event_handler(&dev->event_handler);
+}
+
+static void mcast_remove_one(struct ib_device *device, void *client_data)
+{
+ struct mcast_device *dev = client_data;
+ struct mcast_port *port;
+ int i;
+
+ if (!dev)
+ return;
+
+ ib_unregister_event_handler(&dev->event_handler);
+ flush_workqueue(mcast_wq);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (rdma_cap_ib_mcast(device, dev->start_port + i)) {
+ port = &dev->port[i];
+ deref_port(port);
+ wait_for_completion(&port->comp);
+ }
+ }
+
+ kfree(dev);
+}
+
+int mcast_init(void)
+{
+ int ret;
+
+ mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM);
+ if (!mcast_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+
+ ret = ib_register_client(&mcast_client);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+ return ret;
+}
+
+void mcast_cleanup(void)
+{
+ ib_unregister_client(&mcast_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+}
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
new file mode 100644
index 000000000..3ccaae18a
--- /dev/null
+++ b/drivers/infiniband/core/netlink.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved.
+ * Copyright (c) 2010 Voltaire Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/export.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <rdma/rdma_netlink.h>
+#include <linux/module.h>
+#include "core_priv.h"
+
+static DEFINE_MUTEX(rdma_nl_mutex);
+static struct sock *nls;
+static struct {
+ const struct rdma_nl_cbs *cb_table;
+} rdma_nl_types[RDMA_NL_NUM_CLIENTS];
+
+int rdma_nl_chk_listeners(unsigned int group)
+{
+ return (netlink_has_listeners(nls, group)) ? 0 : -1;
+}
+EXPORT_SYMBOL(rdma_nl_chk_listeners);
+
+static bool is_nl_msg_valid(unsigned int type, unsigned int op)
+{
+ static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
+ [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS,
+ [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
+ [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
+ [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
+ };
+
+ /*
+ * This BUILD_BUG_ON is intended to catch addition of new
+ * RDMA netlink protocol without updating the array above.
+ */
+ BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6);
+
+ if (type >= RDMA_NL_NUM_CLIENTS)
+ return false;
+
+ return (op < max_num_ops[type]) ? true : false;
+}
+
+static bool is_nl_valid(unsigned int type, unsigned int op)
+{
+ const struct rdma_nl_cbs *cb_table;
+
+ if (!is_nl_msg_valid(type, op))
+ return false;
+
+ if (!rdma_nl_types[type].cb_table) {
+ mutex_unlock(&rdma_nl_mutex);
+ request_module("rdma-netlink-subsys-%d", type);
+ mutex_lock(&rdma_nl_mutex);
+ }
+
+ cb_table = rdma_nl_types[type].cb_table;
+
+ if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
+ return false;
+ return true;
+}
+
+void rdma_nl_register(unsigned int index,
+ const struct rdma_nl_cbs cb_table[])
+{
+ mutex_lock(&rdma_nl_mutex);
+ if (!is_nl_msg_valid(index, 0)) {
+ /*
+ * All clients are not interesting in success/failure of
+ * this call. They want to see the print to error log and
+ * continue their initialization. Print warning for them,
+ * because it is programmer's error to be here.
+ */
+ mutex_unlock(&rdma_nl_mutex);
+ WARN(true,
+ "The not-valid %u index was supplied to RDMA netlink\n",
+ index);
+ return;
+ }
+
+ if (rdma_nl_types[index].cb_table) {
+ mutex_unlock(&rdma_nl_mutex);
+ WARN(true,
+ "The %u index is already registered in RDMA netlink\n",
+ index);
+ return;
+ }
+
+ rdma_nl_types[index].cb_table = cb_table;
+ mutex_unlock(&rdma_nl_mutex);
+}
+EXPORT_SYMBOL(rdma_nl_register);
+
+void rdma_nl_unregister(unsigned int index)
+{
+ mutex_lock(&rdma_nl_mutex);
+ rdma_nl_types[index].cb_table = NULL;
+ mutex_unlock(&rdma_nl_mutex);
+}
+EXPORT_SYMBOL(rdma_nl_unregister);
+
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+ int len, int client, int op, int flags)
+{
+ *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags);
+ if (!*nlh)
+ return NULL;
+ return nlmsg_data(*nlh);
+}
+EXPORT_SYMBOL(ibnl_put_msg);
+
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ int len, void *data, int type)
+{
+ if (nla_put(skb, type, len, data)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ibnl_put_attr);
+
+static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ int type = nlh->nlmsg_type;
+ unsigned int index = RDMA_NL_GET_CLIENT(type);
+ unsigned int op = RDMA_NL_GET_OP(type);
+ const struct rdma_nl_cbs *cb_table;
+
+ if (!is_nl_valid(index, op))
+ return -EINVAL;
+
+ cb_table = rdma_nl_types[index].cb_table;
+
+ if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) &&
+ !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ /*
+ * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't
+ * mistakenly call the .dump() function.
+ */
+ if (index == RDMA_NL_LS) {
+ if (cb_table[op].doit)
+ return cb_table[op].doit(skb, nlh, extack);
+ return -EINVAL;
+ }
+ /* FIXME: Convert IWCM to properly handle doit callbacks */
+ if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM ||
+ index == RDMA_NL_IWCM) {
+ struct netlink_dump_control c = {
+ .dump = cb_table[op].dump,
+ };
+ if (c.dump)
+ return netlink_dump_start(nls, skb, nlh, &c);
+ return -EINVAL;
+ }
+
+ if (cb_table[op].doit)
+ return cb_table[op].doit(skb, nlh, extack);
+
+ return 0;
+}
+
+/*
+ * This function is similar to netlink_rcv_skb with one exception:
+ * It calls to the callback for the netlink messages without NLM_F_REQUEST
+ * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed
+ * for that consumer only.
+ */
+static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
+ struct nlmsghdr *,
+ struct netlink_ext_ack *))
+{
+ struct netlink_ext_ack extack = {};
+ struct nlmsghdr *nlh;
+ int err;
+
+ while (skb->len >= nlmsg_total_size(0)) {
+ int msglen;
+
+ nlh = nlmsg_hdr(skb);
+ err = 0;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+ return 0;
+
+ /*
+ * Generally speaking, the only requests are handled
+ * by the kernel, but RDMA_NL_LS is different, because it
+ * runs backward netlink scheme. Kernel initiates messages
+ * and waits for reply with data to keep pathrecord cache
+ * in sync.
+ */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) &&
+ (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS))
+ goto ack;
+
+ /* Skip control messages */
+ if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
+ goto ack;
+
+ err = cb(skb, nlh, &extack);
+ if (err == -EINTR)
+ goto skip;
+
+ack:
+ if (nlh->nlmsg_flags & NLM_F_ACK || err)
+ netlink_ack(skb, nlh, err, &extack);
+
+skip:
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+
+ return 0;
+}
+
+static void rdma_nl_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&rdma_nl_mutex);
+ rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg);
+ mutex_unlock(&rdma_nl_mutex);
+}
+
+int rdma_nl_unicast(struct sk_buff *skb, u32 pid)
+{
+ int err;
+
+ err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT);
+ return (err < 0) ? err : 0;
+}
+EXPORT_SYMBOL(rdma_nl_unicast);
+
+int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid)
+{
+ int err;
+
+ err = netlink_unicast(nls, skb, pid, 0);
+ return (err < 0) ? err : 0;
+}
+EXPORT_SYMBOL(rdma_nl_unicast_wait);
+
+int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags)
+{
+ return nlmsg_multicast(nls, skb, 0, group, flags);
+}
+EXPORT_SYMBOL(rdma_nl_multicast);
+
+int __init rdma_nl_init(void)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = rdma_nl_rcv,
+ };
+
+ nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg);
+ if (!nls)
+ return -ENOMEM;
+
+ nls->sk_sndtimeo = 10 * HZ;
+ return 0;
+}
+
+void rdma_nl_exit(void)
+{
+ int idx;
+
+ for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
+ rdma_nl_unregister(idx);
+
+ netlink_kernel_release(nls);
+}
+
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
new file mode 100644
index 000000000..f6fa9b115
--- /dev/null
+++ b/drivers/infiniband/core/nldev.c
@@ -0,0 +1,1121 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <net/netlink.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+#include "cma_priv.h"
+
+static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
+ [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = IB_DEVICE_NAME_MAX - 1},
+ [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING,
+ .len = IB_FW_VERSION_NAME_MAX - 1},
+ [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING,
+ .len = 16 },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING,
+ .len = TASK_COMM_LEN },
+ [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = {
+ .len = sizeof(struct __kernel_sockaddr_storage) },
+ [RDMA_NLDEV_ATTR_RES_DST_ADDR] = {
+ .len = sizeof(struct __kernel_sockaddr_storage) },
+ [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ },
+ [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+ [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 },
+ [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 },
+ [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 },
+};
+
+static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
+ enum rdma_nldev_print_type print_type)
+{
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, name))
+ return -EMSGSIZE;
+ if (print_type != RDMA_NLDEV_PRINT_TYPE_UNSPEC &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, print_type))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int _rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name,
+ enum rdma_nldev_print_type print_type,
+ u32 value)
+{
+ if (put_driver_name_print_type(msg, name, print_type))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DRIVER_U32, value))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name,
+ enum rdma_nldev_print_type print_type,
+ u64 value)
+{
+ if (put_driver_name_print_type(msg, name, print_type))
+ return -EMSGSIZE;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_DRIVER_U64, value,
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value)
+{
+ return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u32);
+
+int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name,
+ u32 value)
+{
+ return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u32_hex);
+
+int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value)
+{
+ return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u64);
+
+int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value)
+{
+ return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex);
+
+static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)
+{
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
+{
+ char fw[IB_FW_VERSION_NAME_MAX];
+
+ if (fill_nldev_handle(msg, device))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device)))
+ return -EMSGSIZE;
+
+ BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64));
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
+ device->attrs.device_cap_flags,
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+
+ ib_get_device_fw_str(device, fw);
+ /* Device without FW has strlen(fw) = 0 */
+ if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID,
+ be64_to_cpu(device->node_guid),
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID,
+ be64_to_cpu(device->attrs.sys_image_guid),
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int fill_port_info(struct sk_buff *msg,
+ struct ib_device *device, u32 port,
+ const struct net *net)
+{
+ struct net_device *netdev = NULL;
+ struct ib_port_attr attr;
+ int ret;
+
+ if (fill_nldev_handle(msg, device))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port))
+ return -EMSGSIZE;
+
+ ret = ib_query_port(device, port, &attr);
+ if (ret)
+ return ret;
+
+ if (rdma_protocol_ib(device, port)) {
+ BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64));
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
+ (u64)attr.port_cap_flags,
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX,
+ attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc))
+ return -EMSGSIZE;
+ }
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state))
+ return -EMSGSIZE;
+
+ if (device->get_netdev)
+ netdev = device->get_netdev(device, port);
+
+ if (netdev && net_eq(dev_net(netdev), net)) {
+ ret = nla_put_u32(msg,
+ RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
+ if (ret)
+ goto out;
+ ret = nla_put_string(msg,
+ RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name);
+ }
+
+out:
+ if (netdev)
+ dev_put(netdev);
+ return ret;
+}
+
+static int fill_res_info_entry(struct sk_buff *msg,
+ const char *name, u64 curr)
+{
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name))
+ goto err;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr,
+ RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
+{
+ static const char * const names[RDMA_RESTRACK_MAX] = {
+ [RDMA_RESTRACK_PD] = "pd",
+ [RDMA_RESTRACK_CQ] = "cq",
+ [RDMA_RESTRACK_QP] = "qp",
+ [RDMA_RESTRACK_CM_ID] = "cm_id",
+ [RDMA_RESTRACK_MR] = "mr",
+ };
+
+ struct rdma_restrack_root *res = &device->res;
+ struct nlattr *table_attr;
+ int ret, i, curr;
+
+ if (fill_nldev_handle(msg, device))
+ return -EMSGSIZE;
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
+ if (!names[i])
+ continue;
+ curr = rdma_restrack_count(res, i, task_active_pid_ns(current));
+ ret = fill_res_info_entry(msg, names[i], curr);
+ if (ret)
+ goto err;
+ }
+
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, table_attr);
+ return ret;
+}
+
+static int fill_res_name_pid(struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ /*
+ * For user resources, user is should read /proc/PID/comm to get the
+ * name of the task file.
+ */
+ if (rdma_is_kernel_res(res)) {
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME,
+ res->kern_name))
+ return -EMSGSIZE;
+ } else {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID,
+ task_pid_vnr(res->task)))
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_qp *qp = container_of(res, struct ib_qp, res);
+ struct rdma_restrack_root *resroot = &qp->device->res;
+ struct ib_qp_init_attr qp_init_attr;
+ struct nlattr *entry_attr;
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr);
+ if (ret)
+ return ret;
+
+ if (port && port != qp_attr.port_num)
+ return 0;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+ if (!entry_attr)
+ goto out;
+
+ /* In create_qp() port is not set yet */
+ if (qp_attr.port_num &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num))
+ goto err;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num))
+ goto err;
+ if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN,
+ qp_attr.dest_qp_num))
+ goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN,
+ qp_attr.rq_psn))
+ goto err;
+ }
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn))
+ goto err;
+
+ if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC ||
+ qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) {
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE,
+ qp_attr.path_mig_state))
+ goto err;
+ }
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type))
+ goto err;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
+ goto err;
+
+ if (fill_res_name_pid(msg, res))
+ goto err;
+
+ if (resroot->fill_res_entry(msg, res))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+out:
+ return -EMSGSIZE;
+}
+
+static int fill_res_cm_id_entry(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct rdma_id_private *id_priv =
+ container_of(res, struct rdma_id_private, res);
+ struct rdma_restrack_root *resroot = &id_priv->id.device->res;
+ struct rdma_cm_id *cm_id = &id_priv->id;
+ struct nlattr *entry_attr;
+
+ if (port && port != cm_id->port_num)
+ return 0;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY);
+ if (!entry_attr)
+ goto out;
+
+ if (cm_id->port_num &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num))
+ goto err;
+
+ if (id_priv->qp_num) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num))
+ goto err;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type))
+ goto err;
+ }
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps))
+ goto err;
+
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state))
+ goto err;
+
+ if (cm_id->route.addr.src_addr.ss_family &&
+ nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR,
+ sizeof(cm_id->route.addr.src_addr),
+ &cm_id->route.addr.src_addr))
+ goto err;
+ if (cm_id->route.addr.dst_addr.ss_family &&
+ nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR,
+ sizeof(cm_id->route.addr.dst_addr),
+ &cm_id->route.addr.dst_addr))
+ goto err;
+
+ if (fill_res_name_pid(msg, res))
+ goto err;
+
+ if (resroot->fill_res_entry(msg, res))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+out:
+ return -EMSGSIZE;
+}
+
+static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_cq *cq = container_of(res, struct ib_cq, res);
+ struct rdma_restrack_root *resroot = &cq->device->res;
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY);
+ if (!entry_attr)
+ goto out;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe))
+ goto err;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT,
+ atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ /* Poll context is only valid for kernel CQs */
+ if (rdma_is_kernel_res(res) &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
+ goto err;
+
+ if (fill_res_name_pid(msg, res))
+ goto err;
+
+ if (resroot->fill_res_entry(msg, res))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+out:
+ return -EMSGSIZE;
+}
+
+static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_mr *mr = container_of(res, struct ib_mr, res);
+ struct rdma_restrack_root *resroot = &mr->pd->device->res;
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY);
+ if (!entry_attr)
+ goto out;
+
+ if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey))
+ goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey))
+ goto err;
+ }
+
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length,
+ RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ if (fill_res_name_pid(msg, res))
+ goto err;
+
+ if (resroot->fill_res_entry(msg, res))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+out:
+ return -EMSGSIZE;
+}
+
+static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_pd *pd = container_of(res, struct ib_pd, res);
+ struct rdma_restrack_root *resroot = &pd->device->res;
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY);
+ if (!entry_attr)
+ goto out;
+
+ if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,
+ pd->local_dma_lkey))
+ goto err;
+ if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,
+ pd->unsafe_global_rkey))
+ goto err;
+ }
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT,
+ atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ if (fill_res_name_pid(msg, res))
+ goto err;
+
+ if (resroot->fill_res_entry(msg, res))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+out:
+ return -EMSGSIZE;
+}
+
+static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+
+ device = ib_device_get_by_index(index);
+ if (!device)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+ 0, 0);
+
+ err = fill_dev_info(msg, device);
+ if (err)
+ goto err_free;
+
+ nlmsg_end(msg, nlh);
+
+ put_device(&device->dev);
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err:
+ put_device(&device->dev);
+ return err;
+}
+
+static int _nldev_get_dumpit(struct ib_device *device,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ unsigned int idx)
+{
+ int start = cb->args[0];
+ struct nlmsghdr *nlh;
+
+ if (idx < start)
+ return 0;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+ 0, NLM_F_MULTI);
+
+ if (fill_dev_info(skb, device)) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ nlmsg_end(skb, nlh);
+
+ idx++;
+
+out: cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ /*
+ * There is no need to take lock, because
+ * we are relying on ib_core's lists_rwsem
+ */
+ return ib_enum_all_devs(_nldev_get_dumpit, skb, cb);
+}
+
+static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index;
+ u32 port;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err ||
+ !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+ 0, 0);
+
+ err = fill_port_info(msg, device, port, sock_net(skb->sk));
+ if (err)
+ goto err_free;
+
+ nlmsg_end(msg, nlh);
+ put_device(&device->dev);
+
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err:
+ put_device(&device->dev);
+ return err;
+}
+
+static int nldev_port_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ int start = cb->args[0];
+ struct nlmsghdr *nlh;
+ u32 idx = 0;
+ u32 ifindex;
+ int err;
+ u32 p;
+
+ err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NULL);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(ifindex);
+ if (!device)
+ return -EINVAL;
+
+ for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+ /*
+ * The dumpit function returns all information from specific
+ * index. This specific index is taken from the netlink
+ * messages request sent by user and it is available
+ * in cb->args[0].
+ *
+ * Usually, the user doesn't fill this field and it causes
+ * to return everything.
+ *
+ */
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_PORT_GET),
+ 0, NLM_F_MULTI);
+
+ if (fill_port_info(skb, device, p, sock_net(skb->sk))) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+ idx++;
+ nlmsg_end(skb, nlh);
+ }
+
+out:
+ put_device(&device->dev);
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index;
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(index);
+ if (!device)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
+ 0, 0);
+
+ ret = fill_res_info(msg, device);
+ if (ret)
+ goto err_free;
+
+ nlmsg_end(msg, nlh);
+ put_device(&device->dev);
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err:
+ put_device(&device->dev);
+ return ret;
+}
+
+static int _nldev_res_get_dumpit(struct ib_device *device,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ unsigned int idx)
+{
+ int start = cb->args[0];
+ struct nlmsghdr *nlh;
+
+ if (idx < start)
+ return 0;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
+ 0, NLM_F_MULTI);
+
+ if (fill_res_info(skb, device)) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ nlmsg_end(skb, nlh);
+
+ idx++;
+
+out:
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nldev_res_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb);
+}
+
+struct nldev_fill_res_entry {
+ int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb,
+ struct rdma_restrack_entry *res, u32 port);
+ enum rdma_nldev_attr nldev_attr;
+ enum rdma_nldev_command nldev_cmd;
+};
+
+static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
+ [RDMA_RESTRACK_QP] = {
+ .fill_res_func = fill_res_qp_entry,
+ .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET,
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_QP,
+ },
+ [RDMA_RESTRACK_CM_ID] = {
+ .fill_res_func = fill_res_cm_id_entry,
+ .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
+ },
+ [RDMA_RESTRACK_CQ] = {
+ .fill_res_func = fill_res_cq_entry,
+ .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET,
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
+ },
+ [RDMA_RESTRACK_MR] = {
+ .fill_res_func = fill_res_mr_entry,
+ .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET,
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
+ },
+ [RDMA_RESTRACK_PD] = {
+ .fill_res_func = fill_res_pd_entry,
+ .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET,
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
+ },
+};
+
+static int res_get_common_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ enum rdma_restrack_type res_type)
+{
+ const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct rdma_restrack_entry *res;
+ int err, ret = 0, idx = 0;
+ struct nlattr *table_attr;
+ struct ib_device *device;
+ int start = cb->args[0];
+ struct nlmsghdr *nlh;
+ u32 index, port = 0;
+ bool filled = false;
+
+ err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NULL);
+ /*
+ * Right now, we are expecting the device index to get res information,
+ * but it is possible to extend this code to return all devices in
+ * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX.
+ * if it doesn't exist, we will iterate over all devices.
+ *
+ * But it is not needed for now.
+ */
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(index);
+ if (!device)
+ return -EINVAL;
+
+ /*
+ * If no PORT_INDEX is supplied, we will return all QPs from that device
+ */
+ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err_index;
+ }
+ }
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd),
+ 0, NLM_F_MULTI);
+
+ if (fill_nldev_handle(skb, device)) {
+ ret = -EMSGSIZE;
+ goto err;
+ }
+
+ table_attr = nla_nest_start(skb, fe->nldev_attr);
+ if (!table_attr) {
+ ret = -EMSGSIZE;
+ goto err;
+ }
+
+ down_read(&device->res.rwsem);
+ hash_for_each_possible(device->res.hash, res, node, res_type) {
+ if (idx < start)
+ goto next;
+
+ if ((rdma_is_kernel_res(res) &&
+ task_active_pid_ns(current) != &init_pid_ns) ||
+ (!rdma_is_kernel_res(res) && task_active_pid_ns(current) !=
+ task_active_pid_ns(res->task)))
+ /*
+ * 1. Kern resources should be visible in init
+ * namspace only
+ * 2. Present only resources visible in the current
+ * namespace
+ */
+ goto next;
+
+ if (!rdma_restrack_get(res))
+ /*
+ * Resource is under release now, but we are not
+ * relesing lock now, so it will be released in
+ * our next pass, once we will get ->next pointer.
+ */
+ goto next;
+
+ filled = true;
+
+ up_read(&device->res.rwsem);
+ ret = fe->fill_res_func(skb, cb, res, port);
+ down_read(&device->res.rwsem);
+ /*
+ * Return resource back, but it won't be released till
+ * the &device->res.rwsem will be released for write.
+ */
+ rdma_restrack_put(res);
+
+ if (ret == -EMSGSIZE)
+ /*
+ * There is a chance to optimize here.
+ * It can be done by using list_prepare_entry
+ * and list_for_each_entry_continue afterwards.
+ */
+ break;
+ if (ret)
+ goto res_err;
+next: idx++;
+ }
+ up_read(&device->res.rwsem);
+
+ nla_nest_end(skb, table_attr);
+ nlmsg_end(skb, nlh);
+ cb->args[0] = idx;
+
+ /*
+ * No more entries to fill, cancel the message and
+ * return 0 to mark end of dumpit.
+ */
+ if (!filled)
+ goto err;
+
+ put_device(&device->dev);
+ return skb->len;
+
+res_err:
+ nla_nest_cancel(skb, table_attr);
+ up_read(&device->res.rwsem);
+
+err:
+ nlmsg_cancel(skb, nlh);
+
+err_index:
+ put_device(&device->dev);
+ return ret;
+}
+
+static int nldev_res_get_qp_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP);
+}
+
+static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID);
+}
+
+static int nldev_res_get_cq_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ);
+}
+
+static int nldev_res_get_mr_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR);
+}
+
+static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
+}
+
+static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
+ [RDMA_NLDEV_CMD_GET] = {
+ .doit = nldev_get_doit,
+ .dump = nldev_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_PORT_GET] = {
+ .doit = nldev_port_get_doit,
+ .dump = nldev_port_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_GET] = {
+ .doit = nldev_res_get_doit,
+ .dump = nldev_res_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_QP_GET] = {
+ .dump = nldev_res_get_qp_dumpit,
+ /*
+ * .doit is not implemented yet for two reasons:
+ * 1. It is not needed yet.
+ * 2. There is a need to provide identifier, while it is easy
+ * for the QPs (device index + port index + LQPN), it is not
+ * the case for the rest of resources (PD and CQ). Because it
+ * is better to provide similar interface for all resources,
+ * let's wait till we will have other resources implemented
+ * too.
+ */
+ },
+ [RDMA_NLDEV_CMD_RES_CM_ID_GET] = {
+ .dump = nldev_res_get_cm_id_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_CQ_GET] = {
+ .dump = nldev_res_get_cq_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_MR_GET] = {
+ .dump = nldev_res_get_mr_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_PD_GET] = {
+ .dump = nldev_res_get_pd_dumpit,
+ },
+};
+
+void __init nldev_init(void)
+{
+ rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
+}
+
+void __exit nldev_exit(void)
+{
+ rdma_nl_unregister(RDMA_NL_NLDEV);
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5);
diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h
new file mode 100644
index 000000000..3bfab3505
--- /dev/null
+++ b/drivers/infiniband/core/opa_smi.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __OPA_SMI_H_
+#define __OPA_SMI_H_
+
+#include <rdma/ib_smi.h>
+#include <rdma/opa_smi.h>
+
+#include "smi.h"
+
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt);
+int opa_smi_get_fwd_port(struct opa_smp *smp);
+extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp);
+extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return (device->process_mad &&
+ !opa_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return (device->process_mad &&
+ opa_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+#endif /* __OPA_SMI_H_ */
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
new file mode 100644
index 000000000..19b1ee327
--- /dev/null
+++ b/drivers/infiniband/core/packer.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+ switch (size) {
+ case 1: return *(u8 *) (structure + offset);
+ case 2: return be16_to_cpup((__be16 *) (structure + offset));
+ case 4: return be32_to_cpup((__be32 *) (structure + offset));
+ case 8: return be64_to_cpup((__be64 *) (structure + offset));
+ default:
+ pr_warn("Field size %d bits not handled\n", size * 8);
+ return 0;
+ }
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ __be32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+ addr = (__be32 *) buf + desc[i].offset_words;
+ *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ __be64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift);
+ addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+ *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ pr_warn("Structure field %s of size %d bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ if (desc[i].struct_size_bytes)
+ memcpy(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ structure + desc[i].struct_offset_bytes,
+ desc[i].size_bits / 8);
+ else
+ memset(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ 0,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_pack);
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+ switch (size * 8) {
+ case 8: *( u8 *) (structure + offset) = val; break;
+ case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+ case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+ case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+ default:
+ pr_warn("Field size %d bits not handled\n", size * 8);
+ }
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_pack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (!desc[i].struct_size_bytes)
+ continue;
+
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ u32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ mask = ((1ull << desc[i].size_bits) - 1) << shift;
+ addr = (__be32 *) buf + desc[i].offset_words;
+ val = (be32_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ u64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+ addr = (__be64 *) buf + desc[i].offset_words;
+ val = (be64_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ pr_warn("Structure field %s of size %d bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ memcpy(structure + desc[i].struct_offset_bytes,
+ buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
new file mode 100644
index 000000000..5819a2fb0
--- /dev/null
+++ b/drivers/infiniband/core/rdma_core.c
@@ -0,0 +1,1030 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_types.h>
+#include <linux/rcupdate.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/rdma_user_ioctl.h>
+#include "uverbs.h"
+#include "core_priv.h"
+#include "rdma_core.h"
+
+void uverbs_uobject_get(struct ib_uobject *uobject)
+{
+ kref_get(&uobject->ref);
+}
+
+static void uverbs_uobject_free(struct kref *ref)
+{
+ struct ib_uobject *uobj =
+ container_of(ref, struct ib_uobject, ref);
+
+ if (uobj->uapi_object->type_class->needs_kfree_rcu)
+ kfree_rcu(uobj, rcu);
+ else
+ kfree(uobj);
+}
+
+void uverbs_uobject_put(struct ib_uobject *uobject)
+{
+ kref_put(&uobject->ref, uverbs_uobject_free);
+}
+
+static int uverbs_try_lock_object(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+ /*
+ * When a shared access is required, we use a positive counter. Each
+ * shared access request checks that the value != -1 and increment it.
+ * Exclusive access is required for operations like write or destroy.
+ * In exclusive access mode, we check that the counter is zero (nobody
+ * claimed this object) and we set it to -1. Releasing a shared access
+ * lock is done simply by decreasing the counter. As for exclusive
+ * access locks, since only a single one of them is is allowed
+ * concurrently, setting the counter to zero is enough for releasing
+ * this lock.
+ */
+ switch (mode) {
+ case UVERBS_LOOKUP_READ:
+ return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ?
+ -EBUSY : 0;
+ case UVERBS_LOOKUP_WRITE:
+ /* lock is exclusive */
+ return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY;
+ case UVERBS_LOOKUP_DESTROY:
+ return 0;
+ }
+ return 0;
+}
+
+static void assert_uverbs_usecnt(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+#ifdef CONFIG_LOCKDEP
+ switch (mode) {
+ case UVERBS_LOOKUP_READ:
+ WARN_ON(atomic_read(&uobj->usecnt) <= 0);
+ break;
+ case UVERBS_LOOKUP_WRITE:
+ WARN_ON(atomic_read(&uobj->usecnt) != -1);
+ break;
+ case UVERBS_LOOKUP_DESTROY:
+ break;
+ }
+#endif
+}
+
+/*
+ * This must be called with the hw_destroy_rwsem locked for read or write,
+ * also the uobject itself must be locked for write.
+ *
+ * Upon return the HW object is guaranteed to be destroyed.
+ *
+ * For RDMA_REMOVE_ABORT, the hw_destroy_rwsem is not required to be held,
+ * however the type's allocat_commit function cannot have been called and the
+ * uobject cannot be on the uobjects_lists
+ *
+ * For RDMA_REMOVE_DESTROY the caller shold be holding a kref (eg via
+ * rdma_lookup_get_uobject) and the object is left in a state where the caller
+ * needs to call rdma_lookup_put_uobject.
+ *
+ * For all other destroy modes this function internally unlocks the uobject
+ * and consumes the kref on the uobj.
+ */
+static int uverbs_destroy_uobject(struct ib_uobject *uobj,
+ enum rdma_remove_reason reason)
+{
+ struct ib_uverbs_file *ufile = uobj->ufile;
+ unsigned long flags;
+ int ret;
+
+ lockdep_assert_held(&ufile->hw_destroy_rwsem);
+ assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE);
+
+ if (uobj->object) {
+ ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason);
+ if (ret) {
+ if (ib_is_destroy_retryable(ret, reason, uobj))
+ return ret;
+
+ /* Nothing to be done, dangle the memory and move on */
+ WARN(true,
+ "ib_uverbs: failed to remove uobject id %d, driver err=%d",
+ uobj->id, ret);
+ }
+
+ uobj->object = NULL;
+ }
+
+ if (reason == RDMA_REMOVE_ABORT) {
+ WARN_ON(!list_empty(&uobj->list));
+ WARN_ON(!uobj->context);
+ uobj->uapi_object->type_class->alloc_abort(uobj);
+ }
+
+ uobj->context = NULL;
+
+ /*
+ * For DESTROY the usecnt is not changed, the caller is expected to
+ * manage it via uobj_put_destroy(). Only DESTROY can remove the IDR
+ * handle.
+ */
+ if (reason != RDMA_REMOVE_DESTROY)
+ atomic_set(&uobj->usecnt, 0);
+ else
+ uobj->uapi_object->type_class->remove_handle(uobj);
+
+ if (!list_empty(&uobj->list)) {
+ spin_lock_irqsave(&ufile->uobjects_lock, flags);
+ list_del_init(&uobj->list);
+ spin_unlock_irqrestore(&ufile->uobjects_lock, flags);
+
+ /*
+ * Pairs with the get in rdma_alloc_commit_uobject(), could
+ * destroy uobj.
+ */
+ uverbs_uobject_put(uobj);
+ }
+
+ /*
+ * When aborting the stack kref remains owned by the core code, and is
+ * not transferred into the type. Pairs with the get in alloc_uobj
+ */
+ if (reason == RDMA_REMOVE_ABORT)
+ uverbs_uobject_put(uobj);
+
+ return 0;
+}
+
+/*
+ * This calls uverbs_destroy_uobject() using the RDMA_REMOVE_DESTROY
+ * sequence. It should only be used from command callbacks. On success the
+ * caller must pair this with uobj_put_destroy(). This
+ * version requires the caller to have already obtained an
+ * LOOKUP_DESTROY uobject kref.
+ */
+int uobj_destroy(struct ib_uobject *uobj)
+{
+ struct ib_uverbs_file *ufile = uobj->ufile;
+ int ret;
+
+ down_read(&ufile->hw_destroy_rwsem);
+
+ /*
+ * Once the uobject is destroyed by RDMA_REMOVE_DESTROY then it is left
+ * write locked as the callers put it back with UVERBS_LOOKUP_DESTROY.
+ * This is because any other concurrent thread can still see the object
+ * in the xarray due to RCU. Leaving it locked ensures nothing else will
+ * touch it.
+ */
+ ret = uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE);
+ if (ret)
+ goto out_unlock;
+
+ ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY);
+ if (ret) {
+ atomic_set(&uobj->usecnt, 0);
+ goto out_unlock;
+ }
+
+out_unlock:
+ up_read(&ufile->hw_destroy_rwsem);
+ return ret;
+}
+
+/*
+ * uobj_get_destroy destroys the HW object and returns a handle to the uobj
+ * with a NULL object pointer. The caller must pair this with
+ * uobj_put_destroy().
+ */
+struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
+ u32 id, struct ib_uverbs_file *ufile)
+{
+ struct ib_uobject *uobj;
+ int ret;
+
+ uobj = rdma_lookup_get_uobject(obj, ufile, id, UVERBS_LOOKUP_DESTROY);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ ret = uobj_destroy(uobj);
+ if (ret) {
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
+ return ERR_PTR(ret);
+ }
+
+ return uobj;
+}
+
+/*
+ * Does both uobj_get_destroy() and uobj_put_destroy(). Returns success_res
+ * on success (negative errno on failure). For use by callers that do not need
+ * the uobj.
+ */
+int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
+ struct ib_uverbs_file *ufile, int success_res)
+{
+ struct ib_uobject *uobj;
+
+ uobj = __uobj_get_destroy(obj, id, ufile);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ uobj_put_destroy(uobj);
+ return success_res;
+}
+
+/* alloc_uobj must be undone by uverbs_destroy_uobject() */
+static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile,
+ const struct uverbs_api_object *obj)
+{
+ struct ib_uobject *uobj;
+ struct ib_ucontext *ucontext;
+
+ ucontext = ib_uverbs_get_ucontext(ufile);
+ if (IS_ERR(ucontext))
+ return ERR_CAST(ucontext);
+
+ uobj = kzalloc(obj->type_attrs->obj_size, GFP_KERNEL);
+ if (!uobj)
+ return ERR_PTR(-ENOMEM);
+ /*
+ * user_handle should be filled by the handler,
+ * The object is added to the list in the commit stage.
+ */
+ uobj->ufile = ufile;
+ uobj->context = ucontext;
+ INIT_LIST_HEAD(&uobj->list);
+ uobj->uapi_object = obj;
+ /*
+ * Allocated objects start out as write locked to deny any other
+ * syscalls from accessing them until they are committed. See
+ * rdma_alloc_commit_uobject
+ */
+ atomic_set(&uobj->usecnt, -1);
+ kref_init(&uobj->ref);
+
+ return uobj;
+}
+
+static int idr_add_uobj(struct ib_uobject *uobj)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&uobj->ufile->idr_lock);
+
+ /*
+ * We start with allocating an idr pointing to NULL. This represents an
+ * object which isn't initialized yet. We'll replace it later on with
+ * the real object once we commit.
+ */
+ ret = idr_alloc(&uobj->ufile->idr, NULL, 0,
+ min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT);
+ if (ret >= 0)
+ uobj->id = ret;
+
+ spin_unlock(&uobj->ufile->idr_lock);
+ idr_preload_end();
+
+ return ret < 0 ? ret : 0;
+}
+
+/* Returns the ib_uobject or an error. The caller should check for IS_ERR. */
+static struct ib_uobject *
+lookup_get_idr_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode)
+{
+ struct ib_uobject *uobj;
+ unsigned long idrno = id;
+
+ if (id < 0 || id > ULONG_MAX)
+ return ERR_PTR(-EINVAL);
+
+ rcu_read_lock();
+ /* object won't be released as we're protected in rcu */
+ uobj = idr_find(&ufile->idr, idrno);
+ if (!uobj) {
+ uobj = ERR_PTR(-ENOENT);
+ goto free;
+ }
+
+ /*
+ * The idr_find is guaranteed to return a pointer to something that
+ * isn't freed yet, or NULL, as the free after idr_remove goes through
+ * kfree_rcu(). However the object may still have been released and
+ * kfree() could be called at any time.
+ */
+ if (!kref_get_unless_zero(&uobj->ref))
+ uobj = ERR_PTR(-ENOENT);
+
+free:
+ rcu_read_unlock();
+ return uobj;
+}
+
+static struct ib_uobject *
+lookup_get_fd_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode)
+{
+ const struct uverbs_obj_fd_type *fd_type;
+ struct file *f;
+ struct ib_uobject *uobject;
+ int fdno = id;
+
+ if (fdno != id)
+ return ERR_PTR(-EINVAL);
+
+ if (mode != UVERBS_LOOKUP_READ)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!obj->type_attrs)
+ return ERR_PTR(-EIO);
+ fd_type =
+ container_of(obj->type_attrs, struct uverbs_obj_fd_type, type);
+
+ f = fget(fdno);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ uobject = f->private_data;
+ /*
+ * fget(id) ensures we are not currently running uverbs_close_fd,
+ * and the caller is expected to ensure that uverbs_close_fd is never
+ * done while a call top lookup is possible.
+ */
+ if (f->f_op != fd_type->fops || uobject->ufile != ufile) {
+ fput(f);
+ return ERR_PTR(-EBADF);
+ }
+
+ uverbs_uobject_get(uobject);
+ return uobject;
+}
+
+struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode)
+{
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (!obj)
+ return ERR_PTR(-EINVAL);
+
+ uobj = obj->type_class->lookup_get(obj, ufile, id, mode);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ if (uobj->uapi_object != obj) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ /*
+ * If we have been disassociated block every command except for
+ * DESTROY based commands.
+ */
+ if (mode != UVERBS_LOOKUP_DESTROY &&
+ !srcu_dereference(ufile->device->ib_dev,
+ &ufile->device->disassociate_srcu)) {
+ ret = -EIO;
+ goto free;
+ }
+
+ ret = uverbs_try_lock_object(uobj, mode);
+ if (ret)
+ goto free;
+
+ return uobj;
+free:
+ obj->type_class->lookup_put(uobj, mode);
+ uverbs_uobject_put(uobj);
+ return ERR_PTR(ret);
+}
+
+static struct ib_uobject *
+alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile)
+{
+ int ret;
+ struct ib_uobject *uobj;
+
+ uobj = alloc_uobj(ufile, obj);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ ret = idr_add_uobj(uobj);
+ if (ret)
+ goto uobj_put;
+
+ ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto idr_remove;
+
+ return uobj;
+
+idr_remove:
+ spin_lock(&ufile->idr_lock);
+ idr_remove(&ufile->idr, uobj->id);
+ spin_unlock(&ufile->idr_lock);
+uobj_put:
+ uverbs_uobject_put(uobj);
+ return ERR_PTR(ret);
+}
+
+static struct ib_uobject *
+alloc_begin_fd_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile)
+{
+ int new_fd;
+ struct ib_uobject *uobj;
+
+ new_fd = get_unused_fd_flags(O_CLOEXEC);
+ if (new_fd < 0)
+ return ERR_PTR(new_fd);
+
+ uobj = alloc_uobj(ufile, obj);
+ if (IS_ERR(uobj)) {
+ put_unused_fd(new_fd);
+ return uobj;
+ }
+
+ uobj->id = new_fd;
+ uobj->ufile = ufile;
+
+ return uobj;
+}
+
+struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile)
+{
+ struct ib_uobject *ret;
+
+ if (!obj)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * The hw_destroy_rwsem is held across the entire object creation and
+ * released during rdma_alloc_commit_uobject or
+ * rdma_alloc_abort_uobject
+ */
+ if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+ return ERR_PTR(-EIO);
+
+ ret = obj->type_class->alloc_begin(obj, ufile);
+ if (IS_ERR(ret)) {
+ up_read(&ufile->hw_destroy_rwsem);
+ return ret;
+ }
+ return ret;
+}
+
+static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
+{
+ ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+
+ spin_lock(&uobj->ufile->idr_lock);
+ idr_remove(&uobj->ufile->idr, uobj->id);
+ spin_unlock(&uobj->ufile->idr_lock);
+}
+
+static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
+{
+ const struct uverbs_obj_idr_type *idr_type =
+ container_of(uobj->uapi_object->type_attrs,
+ struct uverbs_obj_idr_type, type);
+ int ret = idr_type->destroy_object(uobj, why);
+
+ /*
+ * We can only fail gracefully if the user requested to destroy the
+ * object or when a retry may be called upon an error.
+ * In the rest of the cases, just remove whatever you can.
+ */
+ if (ib_is_destroy_retryable(ret, why, uobj))
+ return ret;
+
+ if (why == RDMA_REMOVE_ABORT)
+ return 0;
+
+ ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+
+ return 0;
+}
+
+static void remove_handle_idr_uobject(struct ib_uobject *uobj)
+{
+ spin_lock(&uobj->ufile->idr_lock);
+ idr_remove(&uobj->ufile->idr, uobj->id);
+ spin_unlock(&uobj->ufile->idr_lock);
+ /* Matches the kref in alloc_commit_idr_uobject */
+ uverbs_uobject_put(uobj);
+}
+
+static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
+{
+ put_unused_fd(uobj->id);
+}
+
+static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
+{
+ const struct uverbs_obj_fd_type *fd_type = container_of(
+ uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type);
+ int ret = fd_type->context_closed(uobj, why);
+
+ if (ib_is_destroy_retryable(ret, why, uobj))
+ return ret;
+
+ return 0;
+}
+
+static void remove_handle_fd_uobject(struct ib_uobject *uobj)
+{
+}
+
+static int alloc_commit_idr_uobject(struct ib_uobject *uobj)
+{
+ struct ib_uverbs_file *ufile = uobj->ufile;
+
+ spin_lock(&ufile->idr_lock);
+ /*
+ * We already allocated this IDR with a NULL object, so
+ * this shouldn't fail.
+ *
+ * NOTE: Once we set the IDR we loose ownership of our kref on uobj.
+ * It will be put by remove_commit_idr_uobject()
+ */
+ WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id));
+ spin_unlock(&ufile->idr_lock);
+
+ return 0;
+}
+
+static int alloc_commit_fd_uobject(struct ib_uobject *uobj)
+{
+ const struct uverbs_obj_fd_type *fd_type = container_of(
+ uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type);
+ int fd = uobj->id;
+ struct file *filp;
+
+ /*
+ * The kref for uobj is moved into filp->private data and put in
+ * uverbs_close_fd(). Once alloc_commit() succeeds uverbs_close_fd()
+ * must be guaranteed to be called from the provided fops release
+ * callback.
+ */
+ filp = anon_inode_getfile(fd_type->name,
+ fd_type->fops,
+ uobj,
+ fd_type->flags);
+ if (IS_ERR(filp))
+ return PTR_ERR(filp);
+
+ uobj->object = filp;
+
+ /* Matching put will be done in uverbs_close_fd() */
+ kref_get(&uobj->ufile->ref);
+
+ /* This shouldn't be used anymore. Use the file object instead */
+ uobj->id = 0;
+
+ /*
+ * NOTE: Once we install the file we loose ownership of our kref on
+ * uobj. It will be put by uverbs_close_fd()
+ */
+ fd_install(fd, filp);
+
+ return 0;
+}
+
+/*
+ * In all cases rdma_alloc_commit_uobject() consumes the kref to uobj and the
+ * caller can no longer assume uobj is valid. If this function fails it
+ * destroys the uboject, including the attached HW object.
+ */
+int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj)
+{
+ struct ib_uverbs_file *ufile = uobj->ufile;
+ int ret;
+
+ /* alloc_commit consumes the uobj kref */
+ ret = uobj->uapi_object->type_class->alloc_commit(uobj);
+ if (ret) {
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+ up_read(&ufile->hw_destroy_rwsem);
+ return ret;
+ }
+
+ /* kref is held so long as the uobj is on the uobj list. */
+ uverbs_uobject_get(uobj);
+ spin_lock_irq(&ufile->uobjects_lock);
+ list_add(&uobj->list, &ufile->uobjects);
+ spin_unlock_irq(&ufile->uobjects_lock);
+
+ /* matches atomic_set(-1) in alloc_uobj */
+ atomic_set(&uobj->usecnt, 0);
+
+ /* Matches the down_read in rdma_alloc_begin_uobject */
+ up_read(&ufile->hw_destroy_rwsem);
+
+ return 0;
+}
+
+/*
+ * This consumes the kref for uobj. It is up to the caller to unwind the HW
+ * object and anything else connected to uobj before calling this.
+ */
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj)
+{
+ struct ib_uverbs_file *ufile = uobj->ufile;
+
+ uobj->object = NULL;
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+
+ /* Matches the down_read in rdma_alloc_begin_uobject */
+ up_read(&ufile->hw_destroy_rwsem);
+}
+
+static void lookup_put_idr_uobject(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+}
+
+static void lookup_put_fd_uobject(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+ struct file *filp = uobj->object;
+
+ WARN_ON(mode != UVERBS_LOOKUP_READ);
+ /* This indirectly calls uverbs_close_fd and free the object */
+ fput(filp);
+}
+
+void rdma_lookup_put_uobject(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+ assert_uverbs_usecnt(uobj, mode);
+ /*
+ * In order to unlock an object, either decrease its usecnt for
+ * read access or zero it in case of exclusive access. See
+ * uverbs_try_lock_object for locking schema information.
+ */
+ switch (mode) {
+ case UVERBS_LOOKUP_READ:
+ atomic_dec(&uobj->usecnt);
+ break;
+ case UVERBS_LOOKUP_WRITE:
+ atomic_set(&uobj->usecnt, 0);
+ break;
+ case UVERBS_LOOKUP_DESTROY:
+ break;
+ }
+
+ uobj->uapi_object->type_class->lookup_put(uobj, mode);
+ /* Pairs with the kref obtained by type->lookup_get */
+ uverbs_uobject_put(uobj);
+}
+
+void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)
+{
+ spin_lock_init(&ufile->idr_lock);
+ idr_init(&ufile->idr);
+}
+
+void release_ufile_idr_uobject(struct ib_uverbs_file *ufile)
+{
+ struct ib_uobject *entry;
+ int id;
+
+ /*
+ * At this point uverbs_cleanup_ufile() is guaranteed to have run, and
+ * there are no HW objects left, however the IDR is still populated
+ * with anything that has not been cleaned up by userspace. Since the
+ * kref on ufile is 0, nothing is allowed to call lookup_get.
+ *
+ * This is an optimized equivalent to remove_handle_idr_uobject
+ */
+ idr_for_each_entry(&ufile->idr, entry, id) {
+ WARN_ON(entry->object);
+ uverbs_uobject_put(entry);
+ }
+
+ idr_destroy(&ufile->idr);
+}
+
+const struct uverbs_obj_type_class uverbs_idr_class = {
+ .alloc_begin = alloc_begin_idr_uobject,
+ .lookup_get = lookup_get_idr_uobject,
+ .alloc_commit = alloc_commit_idr_uobject,
+ .alloc_abort = alloc_abort_idr_uobject,
+ .lookup_put = lookup_put_idr_uobject,
+ .destroy_hw = destroy_hw_idr_uobject,
+ .remove_handle = remove_handle_idr_uobject,
+ /*
+ * When we destroy an object, we first just lock it for WRITE and
+ * actually DESTROY it in the finalize stage. So, the problematic
+ * scenario is when we just started the finalize stage of the
+ * destruction (nothing was executed yet). Now, the other thread
+ * fetched the object for READ access, but it didn't lock it yet.
+ * The DESTROY thread continues and starts destroying the object.
+ * When the other thread continue - without the RCU, it would
+ * access freed memory. However, the rcu_read_lock delays the free
+ * until the rcu_read_lock of the READ operation quits. Since the
+ * exclusive lock of the object is still taken by the DESTROY flow, the
+ * READ operation will get -EBUSY and it'll just bail out.
+ */
+ .needs_kfree_rcu = true,
+};
+EXPORT_SYMBOL(uverbs_idr_class);
+
+void uverbs_close_fd(struct file *f)
+{
+ struct ib_uobject *uobj = f->private_data;
+ struct ib_uverbs_file *ufile = uobj->ufile;
+
+ if (down_read_trylock(&ufile->hw_destroy_rwsem)) {
+ /*
+ * lookup_get_fd_uobject holds the kref on the struct file any
+ * time a FD uobj is locked, which prevents this release
+ * method from being invoked. Meaning we can always get the
+ * write lock here, or we have a kernel bug.
+ */
+ WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE));
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE);
+ up_read(&ufile->hw_destroy_rwsem);
+ }
+
+ /* Matches the get in alloc_begin_fd_uobject */
+ kref_put(&ufile->ref, ib_uverbs_release_file);
+
+ /* Pairs with filp->private_data in alloc_begin_fd_uobject */
+ uverbs_uobject_put(uobj);
+}
+
+static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+ struct ib_device *ib_dev = ibcontext->device;
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+
+ owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+ if (!owning_process)
+ return;
+
+ owning_mm = get_task_mm(owning_process);
+ if (!owning_mm) {
+ pr_info("no mm, disassociate ucontext is pending task termination\n");
+ while (1) {
+ put_task_struct(owning_process);
+ usleep_range(1000, 2000);
+ owning_process = get_pid_task(ibcontext->tgid,
+ PIDTYPE_PID);
+ if (!owning_process ||
+ owning_process->state == TASK_DEAD) {
+ pr_info("disassociate ucontext done, task was terminated\n");
+ /* in case task was dead need to release the
+ * task struct.
+ */
+ if (owning_process)
+ put_task_struct(owning_process);
+ return;
+ }
+ }
+ }
+
+ down_write(&owning_mm->mmap_sem);
+ ib_dev->disassociate_ucontext(ibcontext);
+ up_write(&owning_mm->mmap_sem);
+ mmput(owning_mm);
+ put_task_struct(owning_process);
+}
+
+/*
+ * Drop the ucontext off the ufile and completely disconnect it from the
+ * ib_device
+ */
+static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason)
+{
+ struct ib_ucontext *ucontext = ufile->ucontext;
+ int ret;
+
+ if (reason == RDMA_REMOVE_DRIVER_REMOVE)
+ ufile_disassociate_ucontext(ucontext);
+
+ put_pid(ucontext->tgid);
+ ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,
+ RDMACG_RESOURCE_HCA_HANDLE);
+
+ /*
+ * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove
+ * the error return.
+ */
+ ret = ucontext->device->dealloc_ucontext(ucontext);
+ WARN_ON(ret);
+
+ ufile->ucontext = NULL;
+}
+
+static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason)
+{
+ struct ib_uobject *obj, *next_obj;
+ int ret = -EINVAL;
+
+ /*
+ * This shouldn't run while executing other commands on this
+ * context. Thus, the only thing we should take care of is
+ * releasing a FD while traversing this list. The FD could be
+ * closed and released from the _release fop of this FD.
+ * In order to mitigate this, we add a lock.
+ * We take and release the lock per traversal in order to let
+ * other threads (which might still use the FDs) chance to run.
+ */
+ list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) {
+ /*
+ * if we hit this WARN_ON, that means we are
+ * racing with a lookup_get.
+ */
+ WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE));
+ if (!uverbs_destroy_uobject(obj, reason))
+ ret = 0;
+ else
+ atomic_set(&obj->usecnt, 0);
+ }
+ return ret;
+}
+
+/*
+ * Destroy the uncontext and every uobject associated with it. If called with
+ * reason != RDMA_REMOVE_CLOSE this will not return until the destruction has
+ * been completed and ufile->ucontext is NULL.
+ *
+ * This is internally locked and can be called in parallel from multiple
+ * contexts.
+ */
+void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason)
+{
+ if (reason == RDMA_REMOVE_CLOSE) {
+ /*
+ * During destruction we might trigger something that
+ * synchronously calls release on any file descriptor. For
+ * this reason all paths that come from file_operations
+ * release must use try_lock. They can progress knowing that
+ * there is an ongoing uverbs_destroy_ufile_hw that will clean
+ * up the driver resources.
+ */
+ if (!mutex_trylock(&ufile->ucontext_lock))
+ return;
+
+ } else {
+ mutex_lock(&ufile->ucontext_lock);
+ }
+
+ down_write(&ufile->hw_destroy_rwsem);
+
+ /*
+ * If a ucontext was never created then we can't have any uobjects to
+ * cleanup, nothing to do.
+ */
+ if (!ufile->ucontext)
+ goto done;
+
+ ufile->ucontext->closing = true;
+ ufile->ucontext->cleanup_retryable = true;
+ while (!list_empty(&ufile->uobjects))
+ if (__uverbs_cleanup_ufile(ufile, reason)) {
+ /*
+ * No entry was cleaned-up successfully during this
+ * iteration
+ */
+ break;
+ }
+
+ ufile->ucontext->cleanup_retryable = false;
+ if (!list_empty(&ufile->uobjects))
+ __uverbs_cleanup_ufile(ufile, reason);
+
+ ufile_destroy_ucontext(ufile, reason);
+
+done:
+ up_write(&ufile->hw_destroy_rwsem);
+ mutex_unlock(&ufile->ucontext_lock);
+}
+
+const struct uverbs_obj_type_class uverbs_fd_class = {
+ .alloc_begin = alloc_begin_fd_uobject,
+ .lookup_get = lookup_get_fd_uobject,
+ .alloc_commit = alloc_commit_fd_uobject,
+ .alloc_abort = alloc_abort_fd_uobject,
+ .lookup_put = lookup_put_fd_uobject,
+ .destroy_hw = destroy_hw_fd_uobject,
+ .remove_handle = remove_handle_fd_uobject,
+ .needs_kfree_rcu = false,
+};
+EXPORT_SYMBOL(uverbs_fd_class);
+
+struct ib_uobject *
+uverbs_get_uobject_from_file(u16 object_id,
+ struct ib_uverbs_file *ufile,
+ enum uverbs_obj_access access, s64 id)
+{
+ const struct uverbs_api_object *obj =
+ uapi_get_object(ufile->device->uapi, object_id);
+
+ switch (access) {
+ case UVERBS_ACCESS_READ:
+ return rdma_lookup_get_uobject(obj, ufile, id,
+ UVERBS_LOOKUP_READ);
+ case UVERBS_ACCESS_DESTROY:
+ /* Actual destruction is done inside uverbs_handle_method */
+ return rdma_lookup_get_uobject(obj, ufile, id,
+ UVERBS_LOOKUP_DESTROY);
+ case UVERBS_ACCESS_WRITE:
+ return rdma_lookup_get_uobject(obj, ufile, id,
+ UVERBS_LOOKUP_WRITE);
+ case UVERBS_ACCESS_NEW:
+ return rdma_alloc_begin_uobject(obj, ufile);
+ default:
+ WARN_ON(true);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+int uverbs_finalize_object(struct ib_uobject *uobj,
+ enum uverbs_obj_access access,
+ bool commit)
+{
+ int ret = 0;
+
+ /*
+ * refcounts should be handled at the object level and not at the
+ * uobject level. Refcounts of the objects themselves are done in
+ * handlers.
+ */
+
+ switch (access) {
+ case UVERBS_ACCESS_READ:
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
+ break;
+ case UVERBS_ACCESS_WRITE:
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE);
+ break;
+ case UVERBS_ACCESS_DESTROY:
+ if (uobj)
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
+ break;
+ case UVERBS_ACCESS_NEW:
+ if (commit)
+ ret = rdma_alloc_commit_uobject(uobj);
+ else
+ rdma_alloc_abort_uobject(uobj);
+ break;
+ default:
+ WARN_ON(true);
+ ret = -EOPNOTSUPP;
+ }
+
+ return ret;
+}
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
new file mode 100644
index 000000000..f962f2a59
--- /dev/null
+++ b/drivers/infiniband/core/rdma_core.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_CORE_H
+#define RDMA_CORE_H
+
+#include <linux/idr.h>
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/ib_verbs.h>
+#include <linux/mutex.h>
+
+struct ib_uverbs_device;
+
+void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason);
+
+int uobj_destroy(struct ib_uobject *uobj);
+
+/*
+ * uverbs_uobject_get is called in order to increase the reference count on
+ * an uobject. This is useful when a handler wants to keep the uobject's memory
+ * alive, regardless if this uobject is still alive in the context's objects
+ * repository. Objects are put via uverbs_uobject_put.
+ */
+void uverbs_uobject_get(struct ib_uobject *uobject);
+
+/*
+ * In order to indicate we no longer needs this uobject, uverbs_uobject_put
+ * is called. When the reference count is decreased, the uobject is freed.
+ * For example, this is used when attaching a completion channel to a CQ.
+ */
+void uverbs_uobject_put(struct ib_uobject *uobject);
+
+/* Indicate this fd is no longer used by this consumer, but its memory isn't
+ * necessarily released yet. When the last reference is put, we release the
+ * memory. After this call is executed, calling uverbs_uobject_get isn't
+ * allowed.
+ * This must be called from the release file_operations of the file!
+ */
+void uverbs_close_fd(struct file *f);
+
+/*
+ * Get an ib_uobject that corresponds to the given id from ufile, assuming
+ * the object is from the given type. Lock it to the required access when
+ * applicable.
+ * This function could create (access == NEW), destroy (access == DESTROY)
+ * or unlock (access == READ || access == WRITE) objects if required.
+ * The action will be finalized only when uverbs_finalize_object or
+ * uverbs_finalize_objects are called.
+ */
+struct ib_uobject *
+uverbs_get_uobject_from_file(u16 object_id,
+ struct ib_uverbs_file *ufile,
+ enum uverbs_obj_access access, s64 id);
+
+/*
+ * Note that certain finalize stages could return a status:
+ * (a) alloc_commit could return a failure if the object is committed at the
+ * same time when the context is destroyed.
+ * (b) remove_commit could fail if the object wasn't destroyed successfully.
+ * Since multiple objects could be finalized in one transaction, it is very NOT
+ * recommended to have several finalize actions which have side effects.
+ * For example, it's NOT recommended to have a certain action which has both
+ * a commit action and a destroy action or two destroy objects in the same
+ * action. The rule of thumb is to have one destroy or commit action with
+ * multiple lookups.
+ * The first non zero return value of finalize_object is returned from this
+ * function. For example, this could happen when we couldn't destroy an
+ * object.
+ */
+int uverbs_finalize_object(struct ib_uobject *uobj,
+ enum uverbs_obj_access access,
+ bool commit);
+
+void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile);
+void release_ufile_idr_uobject(struct ib_uverbs_file *ufile);
+
+/*
+ * This is the runtime description of the uverbs API, used by the syscall
+ * machinery to validate and dispatch calls.
+ */
+
+/*
+ * Depending on ID the slot pointer in the radix tree points at one of these
+ * structs.
+ */
+struct uverbs_api_object {
+ const struct uverbs_obj_type *type_attrs;
+ const struct uverbs_obj_type_class *type_class;
+};
+
+struct uverbs_api_ioctl_method {
+ int (__rcu *handler)(struct ib_uverbs_file *ufile,
+ struct uverbs_attr_bundle *ctx);
+ DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN);
+ u16 bundle_size;
+ u8 use_stack:1;
+ u8 driver_method:1;
+ u8 key_bitmap_len;
+ u8 destroy_bkey;
+};
+
+struct uverbs_api_attr {
+ struct uverbs_attr_spec spec;
+};
+
+struct uverbs_api_object;
+struct uverbs_api {
+ /* radix tree contains struct uverbs_api_* pointers */
+ struct radix_tree_root radix;
+ enum rdma_driver_id driver_id;
+};
+
+static inline const struct uverbs_api_object *
+uapi_get_object(struct uverbs_api *uapi, u16 object_id)
+{
+ return radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id));
+}
+
+char *uapi_key_format(char *S, unsigned int key);
+struct uverbs_api *uverbs_alloc_api(
+ const struct uverbs_object_tree_def *const *driver_specs,
+ enum rdma_driver_id driver_id);
+void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev);
+void uverbs_disassociate_api(struct uverbs_api *uapi);
+void uverbs_destroy_api(struct uverbs_api *uapi);
+void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
+ unsigned int num_attrs);
+
+#endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
new file mode 100644
index 000000000..279f0ae65
--- /dev/null
+++ b/drivers/infiniband/core/restrack.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/restrack.h>
+#include <linux/mutex.h>
+#include <linux/sched/task.h>
+#include <linux/pid_namespace.h>
+
+#include "cma_priv.h"
+
+static int fill_res_noop(struct sk_buff *msg,
+ struct rdma_restrack_entry *entry)
+{
+ return 0;
+}
+
+void rdma_restrack_init(struct rdma_restrack_root *res)
+{
+ init_rwsem(&res->rwsem);
+ res->fill_res_entry = fill_res_noop;
+}
+
+static const char *type2str(enum rdma_restrack_type type)
+{
+ static const char * const names[RDMA_RESTRACK_MAX] = {
+ [RDMA_RESTRACK_PD] = "PD",
+ [RDMA_RESTRACK_CQ] = "CQ",
+ [RDMA_RESTRACK_QP] = "QP",
+ [RDMA_RESTRACK_CM_ID] = "CM_ID",
+ [RDMA_RESTRACK_MR] = "MR",
+ };
+
+ return names[type];
+};
+
+void rdma_restrack_clean(struct rdma_restrack_root *res)
+{
+ struct rdma_restrack_entry *e;
+ char buf[TASK_COMM_LEN];
+ struct ib_device *dev;
+ const char *owner;
+ int bkt;
+
+ if (hash_empty(res->hash))
+ return;
+
+ dev = container_of(res, struct ib_device, res);
+ pr_err("restrack: %s", CUT_HERE);
+ pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n",
+ dev->name);
+ hash_for_each(res->hash, bkt, e, node) {
+ if (rdma_is_kernel_res(e)) {
+ owner = e->kern_name;
+ } else {
+ /*
+ * There is no need to call get_task_struct here,
+ * because we can be here only if there are more
+ * get_task_struct() call than put_task_struct().
+ */
+ get_task_comm(buf, e->task);
+ owner = buf;
+ }
+
+ pr_err("restrack: %s %s object allocated by %s is not freed\n",
+ rdma_is_kernel_res(e) ? "Kernel" : "User",
+ type2str(e->type), owner);
+ }
+ pr_err("restrack: %s", CUT_HERE);
+}
+
+int rdma_restrack_count(struct rdma_restrack_root *res,
+ enum rdma_restrack_type type,
+ struct pid_namespace *ns)
+{
+ struct rdma_restrack_entry *e;
+ u32 cnt = 0;
+
+ down_read(&res->rwsem);
+ hash_for_each_possible(res->hash, e, node, type) {
+ if (ns == &init_pid_ns ||
+ (!rdma_is_kernel_res(e) &&
+ ns == task_active_pid_ns(e->task)))
+ cnt++;
+ }
+ up_read(&res->rwsem);
+ return cnt;
+}
+EXPORT_SYMBOL(rdma_restrack_count);
+
+static void set_kern_name(struct rdma_restrack_entry *res)
+{
+ struct ib_pd *pd;
+
+ switch (res->type) {
+ case RDMA_RESTRACK_QP:
+ pd = container_of(res, struct ib_qp, res)->pd;
+ if (!pd) {
+ WARN_ONCE(true, "XRC QPs are not supported\n");
+ /* Survive, despite the programmer's error */
+ res->kern_name = " ";
+ }
+ break;
+ case RDMA_RESTRACK_MR:
+ pd = container_of(res, struct ib_mr, res)->pd;
+ break;
+ default:
+ /* Other types set kern_name directly */
+ pd = NULL;
+ break;
+ }
+
+ if (pd)
+ res->kern_name = pd->res.kern_name;
+}
+
+static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
+{
+ switch (res->type) {
+ case RDMA_RESTRACK_PD:
+ return container_of(res, struct ib_pd, res)->device;
+ case RDMA_RESTRACK_CQ:
+ return container_of(res, struct ib_cq, res)->device;
+ case RDMA_RESTRACK_QP:
+ return container_of(res, struct ib_qp, res)->device;
+ case RDMA_RESTRACK_CM_ID:
+ return container_of(res, struct rdma_id_private,
+ res)->id.device;
+ case RDMA_RESTRACK_MR:
+ return container_of(res, struct ib_mr, res)->device;
+ default:
+ WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
+ return NULL;
+ }
+}
+
+static bool res_is_user(struct rdma_restrack_entry *res)
+{
+ switch (res->type) {
+ case RDMA_RESTRACK_PD:
+ return container_of(res, struct ib_pd, res)->uobject;
+ case RDMA_RESTRACK_CQ:
+ return container_of(res, struct ib_cq, res)->uobject;
+ case RDMA_RESTRACK_QP:
+ return container_of(res, struct ib_qp, res)->uobject;
+ case RDMA_RESTRACK_CM_ID:
+ return !res->kern_name;
+ case RDMA_RESTRACK_MR:
+ return container_of(res, struct ib_mr, res)->pd->uobject;
+ default:
+ WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
+ return false;
+ }
+}
+
+void rdma_restrack_add(struct rdma_restrack_entry *res)
+{
+ struct ib_device *dev = res_to_dev(res);
+
+ if (!dev)
+ return;
+
+ if (res->type != RDMA_RESTRACK_CM_ID || !res_is_user(res))
+ res->task = NULL;
+
+ if (res_is_user(res)) {
+ if (!res->task)
+ rdma_restrack_set_task(res, current);
+ res->kern_name = NULL;
+ } else {
+ set_kern_name(res);
+ }
+
+ kref_init(&res->kref);
+ init_completion(&res->comp);
+ res->valid = true;
+
+ down_write(&dev->res.rwsem);
+ hash_add(dev->res.hash, &res->node, res->type);
+ up_write(&dev->res.rwsem);
+}
+EXPORT_SYMBOL(rdma_restrack_add);
+
+int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
+{
+ return kref_get_unless_zero(&res->kref);
+}
+EXPORT_SYMBOL(rdma_restrack_get);
+
+static void restrack_release(struct kref *kref)
+{
+ struct rdma_restrack_entry *res;
+
+ res = container_of(kref, struct rdma_restrack_entry, kref);
+ complete(&res->comp);
+}
+
+int rdma_restrack_put(struct rdma_restrack_entry *res)
+{
+ return kref_put(&res->kref, restrack_release);
+}
+EXPORT_SYMBOL(rdma_restrack_put);
+
+void rdma_restrack_del(struct rdma_restrack_entry *res)
+{
+ struct ib_device *dev;
+
+ if (!res->valid)
+ goto out;
+
+ dev = res_to_dev(res);
+ if (!dev)
+ return;
+
+ rdma_restrack_put(res);
+
+ wait_for_completion(&res->comp);
+
+ down_write(&dev->res.rwsem);
+ hash_del(&res->node);
+ res->valid = false;
+ up_write(&dev->res.rwsem);
+
+out:
+ if (res->task) {
+ put_task_struct(res->task);
+ res->task = NULL;
+ }
+}
+EXPORT_SYMBOL(rdma_restrack_del);
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 000000000..558de0b98
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,924 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+#include <net/bonding.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+static struct workqueue_struct *gid_cache_wq;
+
+enum gid_op_type {
+ GID_DEL = 0,
+ GID_ADD
+};
+
+struct update_gid_event_work {
+ struct work_struct work;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ 3
+struct netdev_event_work_cmd {
+ roce_netdev_callback cb;
+ roce_netdev_filter filter;
+ struct net_device *ndev;
+ struct net_device *filter_ndev;
+};
+
+struct netdev_event_work {
+ struct work_struct work;
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
+static const struct {
+ bool (*is_supported)(const struct ib_device *device, u8 port_num);
+ enum ib_gid_type gid_type;
+} PORT_CAP_TO_GID_TYPE[] = {
+ {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
+ {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP},
+};
+
+#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
+
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port)
+{
+ int i;
+ unsigned int ret_flags = 0;
+
+ if (!rdma_protocol_roce(ib_dev, port))
+ return 1UL << IB_GID_TYPE_IB;
+
+ for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
+ if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port))
+ ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;
+
+ return ret_flags;
+}
+EXPORT_SYMBOL(roce_gid_type_mask_support);
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+ u8 port, union ib_gid *gid,
+ struct ib_gid_attr *gid_attr)
+{
+ int i;
+ unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ for (i = 0; i < IB_GID_TYPE_SIZE; i++) {
+ if ((1UL << i) & gid_type_mask) {
+ gid_attr->gid_type = i;
+ switch (gid_op) {
+ case GID_ADD:
+ ib_cache_gid_add(ib_dev, port,
+ gid, gid_attr);
+ break;
+ case GID_DEL:
+ ib_cache_gid_del(ib_dev, port,
+ gid, gid_attr);
+ break;
+ }
+ }
+ }
+}
+
+enum bonding_slave_state {
+ BONDING_SLAVE_STATE_ACTIVE = 1UL << 0,
+ BONDING_SLAVE_STATE_INACTIVE = 1UL << 1,
+ /* No primary slave or the device isn't a slave in bonding */
+ BONDING_SLAVE_STATE_NA = 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ if (upper && netif_is_bond_master(upper)) {
+ struct net_device *pdev =
+ bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+ if (pdev)
+ return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+ BONDING_SLAVE_STATE_INACTIVE;
+ }
+
+ return BONDING_SLAVE_STATE_NA;
+}
+
+#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \
+ BONDING_SLAVE_STATE_NA)
+static bool
+is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *real_dev;
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+ real_dev = rdma_vlan_dev_real_dev(cookie);
+ if (!real_dev)
+ real_dev = cookie;
+
+ res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
+ (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+ REQUIRED_BOND_STATES)) ||
+ real_dev == rdma_ndev);
+
+ rcu_read_unlock();
+ return res;
+}
+
+static bool
+is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *master_dev;
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+ master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE;
+ rcu_read_unlock();
+
+ return res;
+}
+
+/** is_ndev_for_default_gid_filter - Check if a given netdevice
+ * can be considered for default GIDs or not.
+ * @ib_dev: IB device to check
+ * @port: Port to consider for adding default GID
+ * @rdma_ndev: rdma netdevice pointer
+ * @cookie_ndev: Netdevice to consider to form a default GID
+ *
+ * is_ndev_for_default_gid_filter() returns true if a given netdevice can be
+ * considered for deriving default RoCE GID, returns false otherwise.
+ */
+static bool
+is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *cookie_ndev = cookie;
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+
+ /*
+ * When rdma netdevice is used in bonding, bonding master netdevice
+ * should be considered for default GIDs. Therefore, ignore slave rdma
+ * netdevices when bonding is considered.
+ * Additionally when event(cookie) netdevice is bond master device,
+ * make sure that it the upper netdevice of rdma netdevice.
+ */
+ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) ||
+ (netif_is_bond_master(cookie_ndev) &&
+ rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)));
+
+ rcu_read_unlock();
+ return res;
+}
+
+static bool pass_all_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ return true;
+}
+
+static bool upper_device_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ if (rdma_ndev == cookie)
+ return true;
+
+ rcu_read_lock();
+ res = rdma_is_upper_dev_rcu(rdma_ndev, cookie);
+ rcu_read_unlock();
+
+ return res;
+}
+
+/**
+ * is_upper_ndev_bond_master_filter - Check if a given netdevice
+ * is bond master device of netdevice of the the RDMA device of port.
+ * @ib_dev: IB device to check
+ * @port: Port to consider for adding default GID
+ * @rdma_ndev: Pointer to rdma netdevice
+ * @cookie: Netdevice to consider to form a default GID
+ *
+ * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev
+ * is bond master device and rdma_ndev is its lower netdevice. It might
+ * not have been established as slave device yet.
+ */
+static bool
+is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net_device *cookie_ndev = cookie;
+ bool match = false;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+ if (netif_is_bond_master(cookie_ndev) &&
+ rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))
+ match = true;
+ rcu_read_unlock();
+ return match;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+ struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev,
+ struct sockaddr *addr)
+{
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+
+ rdma_ip2gid(addr, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *rdma_ndev,
+ struct net_device *event_ndev)
+{
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+ unsigned long gid_type_mask;
+
+ if (!rdma_ndev)
+ return;
+
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ rcu_read_lock();
+
+ if (((rdma_ndev != event_ndev &&
+ !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev)
+ ==
+ BONDING_SLAVE_STATE_INACTIVE)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ rcu_read_unlock();
+
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ gid_type_mask,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE);
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct in_device *in_dev;
+ struct sin_list {
+ struct list_head list;
+ struct sockaddr_in ip;
+ };
+ struct sin_list *sin_iter;
+ struct sin_list *sin_temp;
+
+ LIST_HEAD(sin_list);
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(ndev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ for_ifa(in_dev) {
+ struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry)
+ continue;
+
+ entry->ip.sin_family = AF_INET;
+ entry->ip.sin_addr.s_addr = ifa->ifa_address;
+ list_add_tail(&entry->list, &sin_list);
+ }
+ endfor_ifa(in_dev);
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
+ update_gid_ip(GID_ADD, ib_dev, port, ndev,
+ (struct sockaddr *)&sin_iter->ip);
+ list_del(&sin_iter->list);
+ kfree(sin_iter);
+ }
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *in6_dev;
+ struct sin6_list {
+ struct list_head list;
+ struct sockaddr_in6 sin6;
+ };
+ struct sin6_list *sin6_iter;
+ struct sin6_list *sin6_temp;
+ struct ib_gid_attr gid_attr = {.ndev = ndev};
+ LIST_HEAD(sin6_list);
+
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ in6_dev = in6_dev_get(ndev);
+ if (!in6_dev)
+ return;
+
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry)
+ continue;
+
+ entry->sin6.sin6_family = AF_INET6;
+ entry->sin6.sin6_addr = ifp->addr;
+ list_add_tail(&entry->list, &sin6_list);
+ }
+ read_unlock_bh(&in6_dev->lock);
+
+ in6_dev_put(in6_dev);
+
+ list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+ union ib_gid gid;
+
+ rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+ update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+ list_del(&sin6_iter->list);
+ kfree(sin6_iter);
+ }
+}
+
+static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ enum_netdev_ipv4_ips(ib_dev, port, ndev);
+ if (IS_ENABLED(CONFIG_IPV6))
+ enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ _add_netdev_ips(ib_dev, port, cookie);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie);
+}
+
+/**
+ * del_default_gids - Delete default GIDs of the event/cookie netdevice
+ * @ib_dev: RDMA device pointer
+ * @port: Port of the RDMA device whose GID table to consider
+ * @rdma_ndev: Unused rdma netdevice
+ * @cookie: Pointer to event netdevice
+ *
+ * del_default_gids() deletes the default GIDs of the event/cookie netdevice.
+ */
+static void del_default_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *cookie_ndev = cookie;
+ unsigned long gid_type_mask;
+
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE);
+}
+
+static void add_default_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = cookie;
+ unsigned long gid_type_mask;
+
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+ ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask,
+ IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net *net;
+ struct net_device *ndev;
+
+ /* Lock the rtnl to make sure the netdevs does not move under
+ * our feet
+ */
+ rtnl_lock();
+ down_read(&net_rwsem);
+ for_each_net(net)
+ for_each_netdev(net, ndev) {
+ /*
+ * Filter and add default GIDs of the primary netdevice
+ * when not in bonding mode, or add default GIDs
+ * of bond master device, when in bonding mode.
+ */
+ if (is_ndev_for_default_gid_filter(ib_dev, port,
+ rdma_ndev, ndev))
+ add_default_gids(ib_dev, port, rdma_ndev, ndev);
+
+ if (is_eth_port_of_netdev_filter(ib_dev, port,
+ rdma_ndev, ndev))
+ _add_netdev_ips(ib_dev, port, ndev);
+ }
+ up_read(&net_rwsem);
+ rtnl_unlock();
+}
+
+/**
+ * rdma_roce_rescan_device - Rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices.
+ *
+ * @device: the rdma device
+ */
+void rdma_roce_rescan_device(struct ib_device *ib_dev)
+{
+ ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+ enum_all_gids_of_dev_cb, NULL);
+}
+EXPORT_SYMBOL(rdma_roce_rescan_device);
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct update_gid_event_work *parsed = cookie;
+
+ return update_gid(parsed->gid_op, device,
+ port, &parsed->gid,
+ &parsed->gid_attr);
+}
+
+struct upper_list {
+ struct list_head list;
+ struct net_device *upper;
+};
+
+static int netdev_upper_walk(struct net_device *upper, void *data)
+{
+ struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ struct list_head *upper_list = data;
+
+ if (!entry)
+ return 0;
+
+ list_add_tail(&entry->list, upper_list);
+ dev_hold(upper);
+ entry->upper = upper;
+
+ return 0;
+}
+
+static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+ void *cookie,
+ void (*handle_netdev)(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *ndev))
+{
+ struct net_device *ndev = cookie;
+ struct upper_list *upper_iter;
+ struct upper_list *upper_temp;
+ LIST_HEAD(upper_list);
+
+ rcu_read_lock();
+ netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list);
+ rcu_read_unlock();
+
+ handle_netdev(ib_dev, port, ndev);
+ list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+ list) {
+ handle_netdev(ib_dev, port, upper_iter->upper);
+ dev_put(upper_iter->upper);
+ list_del(&upper_iter->list);
+ kfree(upper_iter);
+ }
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *event_ndev)
+{
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net_device *master_ndev;
+
+ rcu_read_lock();
+ master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ if (master_ndev)
+ dev_hold(master_ndev);
+ rcu_read_unlock();
+
+ if (master_ndev) {
+ bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev,
+ master_ndev);
+ dev_put(master_ndev);
+ }
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event execute ib_enum_all_roce_netdevs through a work.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+ struct netdev_event_work *work =
+ container_of(_work, struct netdev_event_work, work);
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+ ib_enum_all_roce_netdevs(work->cmds[i].filter,
+ work->cmds[i].filter_ndev,
+ work->cmds[i].cb,
+ work->cmds[i].ndev);
+ dev_put(work->cmds[i].ndev);
+ dev_put(work->cmds[i].filter_ndev);
+ }
+
+ kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+ struct net_device *ndev)
+{
+ unsigned int i;
+ struct netdev_event_work *ndev_work =
+ kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+ if (!ndev_work)
+ return NOTIFY_DONE;
+
+ memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+ for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+ if (!ndev_work->cmds[i].ndev)
+ ndev_work->cmds[i].ndev = ndev;
+ if (!ndev_work->cmds[i].filter_ndev)
+ ndev_work->cmds[i].filter_ndev = ndev;
+ dev_hold(ndev_work->cmds[i].ndev);
+ dev_hold(ndev_work->cmds[i].filter_ndev);
+ }
+ INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+ queue_work(gid_cache_wq, &ndev_work->work);
+
+ return NOTIFY_DONE;
+}
+
+static const struct netdev_event_work_cmd add_cmd = {
+ .cb = add_netdev_ips,
+ .filter = is_eth_port_of_netdev_filter
+};
+
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+ .cb = add_netdev_upper_ips,
+ .filter = is_eth_port_of_netdev_filter
+};
+
+static void
+ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ static const struct netdev_event_work_cmd
+ upper_ips_del_cmd = {
+ .cb = del_netdev_upper_ips,
+ .filter = upper_device_filter
+ };
+
+ cmds[0] = upper_ips_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd;
+}
+
+static const struct netdev_event_work_cmd bonding_default_add_cmd = {
+ .cb = add_default_gids,
+ .filter = is_upper_ndev_bond_master_filter
+};
+
+static void
+ndev_event_link(struct net_device *event_ndev,
+ struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ static const struct netdev_event_work_cmd
+ bonding_default_del_cmd = {
+ .cb = del_default_gids,
+ .filter = is_upper_ndev_bond_master_filter
+ };
+ /*
+ * When a lower netdev is linked to its upper bonding
+ * netdev, delete lower slave netdev's default GIDs.
+ */
+ cmds[0] = bonding_default_del_cmd;
+ cmds[0].ndev = event_ndev;
+ cmds[0].filter_ndev = changeupper_info->upper_dev;
+
+ /* Now add bonding upper device default GIDs */
+ cmds[1] = bonding_default_add_cmd;
+ cmds[1].ndev = changeupper_info->upper_dev;
+ cmds[1].filter_ndev = changeupper_info->upper_dev;
+
+ /* Now add bonding upper device IP based GIDs */
+ cmds[2] = add_cmd_upper_ips;
+ cmds[2].ndev = changeupper_info->upper_dev;
+ cmds[2].filter_ndev = changeupper_info->upper_dev;
+}
+
+static void netdevice_event_changeupper(struct net_device *event_ndev,
+ struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ if (changeupper_info->linking)
+ ndev_event_link(event_ndev, changeupper_info, cmds);
+ else
+ ndev_event_unlink(changeupper_info, cmds);
+}
+
+static const struct netdev_event_work_cmd add_default_gid_cmd = {
+ .cb = add_default_gids,
+ .filter = is_ndev_for_default_gid_filter,
+};
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ static const struct netdev_event_work_cmd del_cmd = {
+ .cb = del_netdev_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd
+ bonding_default_del_cmd_join = {
+ .cb = del_netdev_default_ips_join,
+ .filter = is_eth_port_inactive_slave_filter
+ };
+ static const struct netdev_event_work_cmd
+ netdev_del_cmd = {
+ .cb = del_netdev_ips,
+ .filter = is_eth_port_of_netdev_filter
+ };
+ static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ cmds[0] = bonding_default_del_cmd_join;
+ cmds[1] = add_default_gid_cmd;
+ cmds[2] = add_cmd;
+ break;
+
+ case NETDEV_UNREGISTER:
+ if (ndev->reg_state < NETREG_UNREGISTERED)
+ cmds[0] = del_cmd;
+ else
+ return NOTIFY_DONE;
+ break;
+
+ case NETDEV_CHANGEADDR:
+ cmds[0] = netdev_del_cmd;
+ if (ndev->reg_state == NETREG_REGISTERED) {
+ cmds[1] = add_default_gid_cmd;
+ cmds[2] = add_cmd;
+ }
+ break;
+
+ case NETDEV_CHANGEUPPER:
+ netdevice_event_changeupper(ndev,
+ container_of(ptr, struct netdev_notifier_changeupper_info, info),
+ cmds);
+ break;
+
+ case NETDEV_BONDING_FAILOVER:
+ cmds[0] = bonding_event_ips_del_cmd;
+ /* Add default GIDs of the bond device */
+ cmds[1] = bonding_default_add_cmd;
+ /* Add IP based GIDs of the bond device */
+ cmds[2] = add_cmd_upper_ips;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+ struct update_gid_event_work *work =
+ container_of(_work, struct update_gid_event_work, work);
+
+ ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter,
+ work->gid_attr.ndev,
+ callback_for_addr_gid_device_scan, work);
+
+ dev_put(work->gid_attr.ndev);
+ kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+ struct sockaddr *sa, struct net_device *ndev)
+{
+ struct update_gid_event_work *work;
+ enum gid_op_type gid_op;
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ gid_op = GID_ADD;
+ break;
+
+ case NETDEV_DOWN:
+ gid_op = GID_DEL;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return NOTIFY_DONE;
+
+ INIT_WORK(&work->work, update_gid_event_work_handler);
+
+ rdma_ip2gid(sa, &work->gid);
+ work->gid_op = gid_op;
+
+ memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+ dev_hold(ndev);
+ work->gid_attr.ndev = ndev;
+
+ queue_work(gid_cache_wq, &work->work);
+
+ return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in in;
+ struct net_device *ndev;
+ struct in_ifaddr *ifa = ptr;
+
+ in.sin_family = AF_INET;
+ in.sin_addr.s_addr = ifa->ifa_address;
+ ndev = ifa->ifa_dev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in6 in6;
+ struct net_device *ndev;
+ struct inet6_ifaddr *ifa6 = ptr;
+
+ in6.sin6_family = AF_INET6;
+ in6.sin6_addr = ifa6->addr;
+ ndev = ifa6->idev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+
+static struct notifier_block nb_netdevice = {
+ .notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+ .notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+ .notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+ gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0);
+ if (!gid_cache_wq)
+ return -ENOMEM;
+
+ register_inetaddr_notifier(&nb_inetaddr);
+ if (IS_ENABLED(CONFIG_IPV6))
+ register_inet6addr_notifier(&nb_inet6addr);
+ /* We relay on the netdevice notifier to enumerate all
+ * existing devices in the system. Register to this notifier
+ * last to make sure we will not miss any IP add/del
+ * callbacks.
+ */
+ register_netdevice_notifier(&nb_netdevice);
+
+ return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+ if (IS_ENABLED(CONFIG_IPV6))
+ unregister_inet6addr_notifier(&nb_inet6addr);
+ unregister_inetaddr_notifier(&nb_inetaddr);
+ unregister_netdevice_notifier(&nb_netdevice);
+ /* Ensure all gid deletion tasks complete before we go down,
+ * to avoid any reference to free'd memory. By the time
+ * ib-core is removed, all physical devices have been removed,
+ * so no issue with remaining hardware contexts.
+ */
+ destroy_workqueue(gid_cache_wq);
+}
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
new file mode 100644
index 000000000..683e6d11a
--- /dev/null
+++ b/drivers/infiniband/core/rw.c
@@ -0,0 +1,745 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <rdma/mr_pool.h>
+#include <rdma/rw.h>
+
+enum {
+ RDMA_RW_SINGLE_WR,
+ RDMA_RW_MULTI_WR,
+ RDMA_RW_MR,
+ RDMA_RW_SIG_MR,
+};
+
+static bool rdma_rw_force_mr;
+module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
+MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
+
+/*
+ * Check if the device might use memory registration. This is currently only
+ * true for iWarp devices. In the future we can hopefully fine tune this based
+ * on HCA driver input.
+ */
+static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
+{
+ if (rdma_protocol_iwarp(dev, port_num))
+ return true;
+ if (unlikely(rdma_rw_force_mr))
+ return true;
+ return false;
+}
+
+/*
+ * Check if the device will use memory registration for this RW operation.
+ * We currently always use memory registrations for iWarp RDMA READs, and
+ * have a debug option to force usage of MRs.
+ *
+ * XXX: In the future we can hopefully fine tune this based on HCA driver
+ * input.
+ */
+static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
+ enum dma_data_direction dir, int dma_nents)
+{
+ if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE)
+ return true;
+ if (unlikely(rdma_rw_force_mr))
+ return true;
+ return false;
+}
+
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+{
+ /* arbitrary limit to avoid allocating gigantic resources */
+ return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+}
+
+/* Caller must have zero-initialized *reg. */
+static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
+ struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
+ u32 sg_cnt, u32 offset)
+{
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+ u32 nents = min(sg_cnt, pages_per_mr);
+ int count = 0, ret;
+
+ reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
+ if (!reg->mr)
+ return -EAGAIN;
+
+ if (reg->mr->need_inval) {
+ reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+ reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+ reg->inv_wr.next = &reg->reg_wr.wr;
+ count++;
+ } else {
+ reg->inv_wr.next = NULL;
+ }
+
+ ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
+ if (ret < 0 || ret < nents) {
+ ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
+ return -EINVAL;
+ }
+
+ reg->reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg->reg_wr.mr = reg->mr;
+ reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+ if (rdma_protocol_iwarp(qp->device, port_num))
+ reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+ count++;
+
+ reg->sge.addr = reg->mr->iova;
+ reg->sge.length = reg->mr->length;
+ return count;
+}
+
+static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct rdma_rw_reg_ctx *prev = NULL;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+ int i, j, ret = 0, count = 0;
+
+ ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
+ ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
+ if (!ctx->reg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+ u32 nents = min(sg_cnt, pages_per_mr);
+
+ ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
+ offset);
+ if (ret < 0)
+ goto out_free;
+ count += ret;
+
+ if (prev) {
+ if (reg->mr->need_inval)
+ prev->wr.wr.next = &reg->inv_wr;
+ else
+ prev->wr.wr.next = &reg->reg_wr.wr;
+ }
+
+ reg->reg_wr.wr.next = &reg->wr.wr;
+
+ reg->wr.wr.sg_list = &reg->sge;
+ reg->wr.wr.num_sge = 1;
+ reg->wr.remote_addr = remote_addr;
+ reg->wr.rkey = rkey;
+ if (dir == DMA_TO_DEVICE) {
+ reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+ } else if (!rdma_cap_read_inv(qp->device, port_num)) {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ;
+ } else {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+ }
+ count++;
+
+ remote_addr += reg->sge.length;
+ sg_cnt -= nents;
+ for (j = 0; j < nents; j++)
+ sg = sg_next(sg);
+ prev = reg;
+ offset = 0;
+ }
+
+ if (prev)
+ prev->wr.wr.next = NULL;
+
+ ctx->type = RDMA_RW_MR;
+ return count;
+
+out_free:
+ while (--i >= 0)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+out:
+ return ret;
+}
+
+static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ struct scatterlist *sg, u32 sg_cnt, u32 offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+ qp->max_read_sge;
+ struct ib_sge *sge;
+ u32 total_len = 0, i, j;
+
+ ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
+
+ ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
+ if (!ctx->map.sges)
+ goto out;
+
+ ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
+ if (!ctx->map.wrs)
+ goto out_free_sges;
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+ u32 nr_sge = min(sg_cnt, max_sge);
+
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->remote_addr = remote_addr + total_len;
+ rdma_wr->rkey = rkey;
+ rdma_wr->wr.num_sge = nr_sge;
+ rdma_wr->wr.sg_list = sge;
+
+ for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
+ sge->addr = ib_sg_dma_address(dev, sg) + offset;
+ sge->length = ib_sg_dma_len(dev, sg) - offset;
+ sge->lkey = qp->pd->local_dma_lkey;
+
+ total_len += sge->length;
+ sge++;
+ sg_cnt--;
+ offset = 0;
+ }
+
+ rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+ &ctx->map.wrs[i + 1].wr : NULL;
+ }
+
+ ctx->type = RDMA_RW_MULTI_WR;
+ return ctx->nr_ops;
+
+out_free_sges:
+ kfree(ctx->map.sges);
+out:
+ return -ENOMEM;
+}
+
+static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
+
+ ctx->nr_ops = 1;
+
+ ctx->single.sge.lkey = qp->pd->local_dma_lkey;
+ ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
+ ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+
+ memset(rdma_wr, 0, sizeof(*rdma_wr));
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->wr.sg_list = &ctx->single.sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+
+ ctx->type = RDMA_RW_SINGLE_WR;
+ return 1;
+}
+
+/**
+ * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist to READ/WRITE from/to
+ * @sg_cnt: number of entries in @sg
+ * @sg_offset: current byte offset into @sg
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ int ret;
+
+ ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+ if (!ret)
+ return -ENOMEM;
+ sg_cnt = ret;
+
+ /*
+ * Skip to the S/G entry that sg_offset falls into:
+ */
+ for (;;) {
+ u32 len = ib_sg_dma_len(dev, sg);
+
+ if (sg_offset < len)
+ break;
+
+ sg = sg_next(sg);
+ sg_offset -= len;
+ sg_cnt--;
+ }
+
+ ret = -EIO;
+ if (WARN_ON_ONCE(sg_cnt == 0))
+ goto out_unmap_sg;
+
+ if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
+ ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
+ sg_offset, remote_addr, rkey, dir);
+ } else if (sg_cnt > 1) {
+ ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
+ remote_addr, rkey, dir);
+ } else {
+ ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
+ remote_addr, rkey, dir);
+ }
+
+ if (ret < 0)
+ goto out_unmap_sg;
+ return ret;
+
+out_unmap_sg:
+ ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_init);
+
+/**
+ * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist to READ/WRITE from/to
+ * @sg_cnt: number of entries in @sg
+ * @prot_sg: scatterlist to READ/WRITE protection information from/to
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @sig_attrs: signature offloading algorithms
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ struct ib_sig_attrs *sig_attrs,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+ struct ib_rdma_wr *rdma_wr;
+ struct ib_send_wr *prev_wr = NULL;
+ int count = 0, ret;
+
+ if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
+ pr_err("SG count too large\n");
+ return -EINVAL;
+ }
+
+ ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+ if (!ret)
+ return -ENOMEM;
+ sg_cnt = ret;
+
+ ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+ if (!ret) {
+ ret = -ENOMEM;
+ goto out_unmap_sg;
+ }
+ prot_sg_cnt = ret;
+
+ ctx->type = RDMA_RW_SIG_MR;
+ ctx->nr_ops = 1;
+ ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
+ if (!ctx->sig) {
+ ret = -ENOMEM;
+ goto out_unmap_prot_sg;
+ }
+
+ ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
+ if (ret < 0)
+ goto out_free_ctx;
+ count += ret;
+ prev_wr = &ctx->sig->data.reg_wr.wr;
+
+ ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
+ prot_sg, prot_sg_cnt, 0);
+ if (ret < 0)
+ goto out_destroy_data_mr;
+ count += ret;
+
+ if (ctx->sig->prot.inv_wr.next)
+ prev_wr->next = &ctx->sig->prot.inv_wr;
+ else
+ prev_wr->next = &ctx->sig->prot.reg_wr.wr;
+ prev_wr = &ctx->sig->prot.reg_wr.wr;
+
+ ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+ if (!ctx->sig->sig_mr) {
+ ret = -EAGAIN;
+ goto out_destroy_prot_mr;
+ }
+
+ if (ctx->sig->sig_mr->need_inval) {
+ memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+
+ ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
+ ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+
+ prev_wr->next = &ctx->sig->sig_inv_wr;
+ prev_wr = &ctx->sig->sig_inv_wr;
+ }
+
+ ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
+ ctx->sig->sig_wr.wr.wr_cqe = NULL;
+ ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
+ ctx->sig->sig_wr.wr.num_sge = 1;
+ ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
+ ctx->sig->sig_wr.sig_attrs = sig_attrs;
+ ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
+ if (prot_sg_cnt)
+ ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
+ prev_wr->next = &ctx->sig->sig_wr.wr;
+ prev_wr = &ctx->sig->sig_wr.wr;
+ count++;
+
+ ctx->sig->sig_sge.addr = 0;
+ ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
+ if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
+ ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+
+ rdma_wr = &ctx->sig->data.wr;
+ rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ prev_wr->next = &rdma_wr->wr;
+ prev_wr = &rdma_wr->wr;
+ count++;
+
+ return count;
+
+out_destroy_prot_mr:
+ if (prot_sg_cnt)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+out_destroy_data_mr:
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_free_ctx:
+ kfree(ctx->sig);
+out_unmap_prot_sg:
+ ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+out_unmap_sg:
+ ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
+
+/*
+ * Now that we are going to post the WRs we can update the lkey and need_inval
+ * state on the MRs. If we were doing this at init time, we would get double
+ * or missing invalidations if a context was initialized but not actually
+ * posted.
+ */
+static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
+{
+ reg->mr->need_inval = need_inval;
+ ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
+ reg->reg_wr.key = reg->mr->lkey;
+ reg->sge.lkey = reg->mr->lkey;
+}
+
+/**
+ * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
+ * @ctx: context to operate on
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @cqe: completion queue entry for the last WR
+ * @chain_wr: WR to append to the posted chain
+ *
+ * Return the WR chain for the set of RDMA READ/WRITE operations described by
+ * @ctx, as well as any memory registration operations needed. If @chain_wr
+ * is non-NULL the WR it points to will be appended to the chain of WRs posted.
+ * If @chain_wr is not set @cqe must be set so that the caller gets a
+ * completion notification.
+ */
+struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+ struct ib_send_wr *first_wr, *last_wr;
+ int i;
+
+ switch (ctx->type) {
+ case RDMA_RW_SIG_MR:
+ rdma_rw_update_lkey(&ctx->sig->data, true);
+ if (ctx->sig->prot.mr)
+ rdma_rw_update_lkey(&ctx->sig->prot, true);
+
+ ctx->sig->sig_mr->need_inval = true;
+ ib_update_fast_reg_key(ctx->sig->sig_mr,
+ ib_inc_rkey(ctx->sig->sig_mr->lkey));
+ ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
+
+ if (ctx->sig->data.inv_wr.next)
+ first_wr = &ctx->sig->data.inv_wr;
+ else
+ first_wr = &ctx->sig->data.reg_wr.wr;
+ last_wr = &ctx->sig->data.wr.wr;
+ break;
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++) {
+ rdma_rw_update_lkey(&ctx->reg[i],
+ ctx->reg[i].wr.wr.opcode !=
+ IB_WR_RDMA_READ_WITH_INV);
+ }
+
+ if (ctx->reg[0].inv_wr.next)
+ first_wr = &ctx->reg[0].inv_wr;
+ else
+ first_wr = &ctx->reg[0].reg_wr.wr;
+ last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
+ break;
+ case RDMA_RW_MULTI_WR:
+ first_wr = &ctx->map.wrs[0].wr;
+ last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
+ break;
+ case RDMA_RW_SINGLE_WR:
+ first_wr = &ctx->single.wr.wr;
+ last_wr = &ctx->single.wr.wr;
+ break;
+ default:
+ BUG();
+ }
+
+ if (chain_wr) {
+ last_wr->next = chain_wr;
+ } else {
+ last_wr->wr_cqe = cqe;
+ last_wr->send_flags |= IB_SEND_SIGNALED;
+ }
+
+ return first_wr;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_wrs);
+
+/**
+ * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
+ * @ctx: context to operate on
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @cqe: completion queue entry for the last WR
+ * @chain_wr: WR to append to the posted chain
+ *
+ * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
+ * any memory registration operations needed. If @chain_wr is non-NULL the
+ * WR it points to will be appended to the chain of WRs posted. If @chain_wr
+ * is not set @cqe must be set so that the caller gets a completion
+ * notification.
+ */
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+ struct ib_send_wr *first_wr;
+
+ first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
+ return ib_post_send(qp, first_wr, NULL);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_post);
+
+/**
+ * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist that was used for the READ/WRITE
+ * @sg_cnt: number of entries in @sg
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
+{
+ int i;
+
+ switch (ctx->type) {
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+ break;
+ case RDMA_RW_MULTI_WR:
+ kfree(ctx->map.wrs);
+ kfree(ctx->map.sges);
+ break;
+ case RDMA_RW_SINGLE_WR:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+
+/**
+ * rdma_rw_ctx_destroy_signature - release all resources allocated by
+ * rdma_rw_ctx_init_signature
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist that was used for the READ/WRITE
+ * @sg_cnt: number of entries in @sg
+ * @prot_sg: scatterlist that was used for the READ/WRITE of the PI
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ enum dma_data_direction dir)
+{
+ if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
+ return;
+
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+
+ if (ctx->sig->prot.mr) {
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+ ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+ }
+
+ ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
+ kfree(ctx->sig);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
+
+/**
+ * rdma_rw_mr_factor - return number of MRs required for a payload
+ * @device: device handling the connection
+ * @port_num: port num to which the connection is bound
+ * @maxpages: maximum payload pages per rdma_rw_ctx
+ *
+ * Returns the number of MRs the device requires to move @maxpayload
+ * bytes. The returned value is used during transport creation to
+ * compute max_rdma_ctxts and the size of the transport's Send and
+ * Send Completion Queues.
+ */
+unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
+ unsigned int maxpages)
+{
+ unsigned int mr_pages;
+
+ if (rdma_rw_can_use_mr(device, port_num))
+ mr_pages = rdma_rw_fr_page_list_len(device);
+ else
+ mr_pages = device->attrs.max_sge_rd;
+ return DIV_ROUND_UP(maxpages, mr_pages);
+}
+EXPORT_SYMBOL(rdma_rw_mr_factor);
+
+void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
+{
+ u32 factor;
+
+ WARN_ON_ONCE(attr->port_num == 0);
+
+ /*
+ * Each context needs at least one RDMA READ or WRITE WR.
+ *
+ * For some hardware we might need more, eventually we should ask the
+ * HCA driver for a multiplier here.
+ */
+ factor = 1;
+
+ /*
+ * If the devices needs MRs to perform RDMA READ or WRITE operations,
+ * we'll need two additional MRs for the registrations and the
+ * invalidation.
+ */
+ if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
+ factor += 6; /* (inv + reg) * (data + prot + sig) */
+ else if (rdma_rw_can_use_mr(dev, attr->port_num))
+ factor += 2; /* inv + reg */
+
+ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
+
+ /*
+ * But maybe we were just too high in the sky and the device doesn't
+ * even support all we need, and we'll have to live with what we get..
+ */
+ attr->cap.max_send_wr =
+ min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
+}
+
+int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 nr_mrs = 0, nr_sig_mrs = 0;
+ int ret = 0;
+
+ if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+ nr_sig_mrs = attr->cap.max_rdma_ctxs;
+ nr_mrs = attr->cap.max_rdma_ctxs * 2;
+ } else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
+ nr_mrs = attr->cap.max_rdma_ctxs;
+ }
+
+ if (nr_mrs) {
+ ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
+ IB_MR_TYPE_MEM_REG,
+ rdma_rw_fr_page_list_len(dev));
+ if (ret) {
+ pr_err("%s: failed to allocated %d MRs\n",
+ __func__, nr_mrs);
+ return ret;
+ }
+ }
+
+ if (nr_sig_mrs) {
+ ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
+ IB_MR_TYPE_SIGNATURE, 2);
+ if (ret) {
+ pr_err("%s: failed to allocated %d SIG MRs\n",
+ __func__, nr_mrs);
+ goto out_free_rdma_mrs;
+ }
+ }
+
+ return 0;
+
+out_free_rdma_mrs:
+ ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+ return ret;
+}
+
+void rdma_rw_cleanup_mrs(struct ib_qp *qp)
+{
+ ib_mr_pool_destroy(qp, &qp->sig_mrs);
+ ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+}
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
new file mode 100644
index 000000000..b1d4bbf4c
--- /dev/null
+++ b/drivers/infiniband/core/sa.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+ atomic_inc(&client->users);
+}
+
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+ if (atomic_dec_and_test(&client->users))
+ complete(&client->comp);
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+#endif /* SA_H */
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
new file mode 100644
index 000000000..9881e6fa9
--- /dev/null
+++ b/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,2527 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/if_ether.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/ib_addr.h>
+#include <rdma/opa_addr.h>
+#include "sa.h"
+#include "core_priv.h"
+
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
+#define IB_SA_CPI_MAX_RETRY_CNT 3
+#define IB_SA_CPI_RETRY_WAIT 1000 /*msecs */
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
+struct ib_sa_sm_ah {
+ struct ib_ah *ah;
+ struct kref ref;
+ u16 pkey_index;
+ u8 src_path_mask;
+};
+
+enum rdma_class_port_info_type {
+ RDMA_CLASS_PORT_INFO_IB,
+ RDMA_CLASS_PORT_INFO_OPA
+};
+
+struct rdma_class_port_info {
+ enum rdma_class_port_info_type type;
+ union {
+ struct ib_class_port_info ib;
+ struct opa_class_port_info opa;
+ };
+};
+
+struct ib_sa_classport_cache {
+ bool valid;
+ int retry_cnt;
+ struct rdma_class_port_info data;
+};
+
+struct ib_sa_port {
+ struct ib_mad_agent *agent;
+ struct ib_sa_sm_ah *sm_ah;
+ struct work_struct update_task;
+ struct ib_sa_classport_cache classport_info;
+ struct delayed_work ib_cpi_work;
+ spinlock_t classport_lock; /* protects class port info set */
+ spinlock_t ah_lock;
+ u8 port_num;
+};
+
+struct ib_sa_device {
+ int start_port, end_port;
+ struct ib_event_handler event_handler;
+ struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+ void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+ void (*release)(struct ib_sa_query *);
+ struct ib_sa_client *client;
+ struct ib_sa_port *port;
+ struct ib_mad_send_buf *mad_buf;
+ struct ib_sa_sm_ah *sm_ah;
+ int id;
+ u32 flags;
+ struct list_head list; /* Local svc request list */
+ u32 seq; /* Local svc request sequence number */
+ unsigned long timeout; /* Local svc timeout */
+ u8 path_use; /* How will the pathrecord be used */
+};
+
+#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
+#define IB_SA_CANCEL 0x00000002
+#define IB_SA_QUERY_OPA 0x00000004
+
+struct ib_sa_service_query {
+ void (*callback)(int, struct ib_sa_service_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_path_query {
+ void (*callback)(int, struct sa_path_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+ struct sa_path_rec *conv_pr;
+};
+
+struct ib_sa_guidinfo_query {
+ void (*callback)(int, struct ib_sa_guidinfo_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_classport_info_query {
+ void (*callback)(void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+ void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY,
+ .len = sizeof(struct ib_path_rec_data)},
+ [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32},
+ [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64},
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8},
+ [LS_NLA_TYPE_PKEY] = {.type = NLA_U16},
+ [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16},
+};
+
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client sa_client = {
+ .name = "sa",
+ .add = ib_sa_add_one,
+ .remove = ib_sa_remove_one
+};
+
+static DEFINE_SPINLOCK(idr_lock);
+static DEFINE_IDR(query_idr);
+
+static DEFINE_SPINLOCK(tid_lock);
+static u32 tid;
+
+#define PATH_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct sa_path_rec, field), \
+ .struct_size_bytes = sizeof((struct sa_path_rec *)0)->field, \
+ .field_name = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+ { PATH_REC_FIELD(service_id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(ib.dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(ib.slid),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(ib.raw_traffic),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 11,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { PATH_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { PATH_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(traffic_class),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(reversible),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { PATH_REC_FIELD(numb_path),
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { PATH_REC_FIELD(pkey),
+ .offset_words = 12,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(qos_class),
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 12 },
+ { PATH_REC_FIELD(sl),
+ .offset_words = 13,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { PATH_REC_FIELD(mtu_selector),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(mtu),
+ .offset_words = 13,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(rate_selector),
+ .offset_words = 13,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(rate),
+ .offset_words = 13,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(packet_life_time),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(preference),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 48 },
+};
+
+#define OPA_PATH_REC_FIELD(field) \
+ .struct_offset_bytes = \
+ offsetof(struct sa_path_rec, field), \
+ .struct_size_bytes = \
+ sizeof((struct sa_path_rec *)0)->field, \
+ .field_name = "sa_path_rec:" #field
+
+static const struct ib_field opa_path_rec_table[] = {
+ { OPA_PATH_REC_FIELD(service_id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { OPA_PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_PATH_REC_FIELD(opa.dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_PATH_REC_FIELD(opa.slid),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_PATH_REC_FIELD(opa.raw_traffic),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { OPA_PATH_REC_FIELD(flow_label),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { OPA_PATH_REC_FIELD(hop_limit),
+ .offset_words = 12,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { OPA_PATH_REC_FIELD(traffic_class),
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { OPA_PATH_REC_FIELD(reversible),
+ .offset_words = 13,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(numb_path),
+ .offset_words = 13,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { OPA_PATH_REC_FIELD(pkey),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { OPA_PATH_REC_FIELD(opa.l2_8B),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(opa.l2_10B),
+ .offset_words = 14,
+ .offset_bits = 1,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(opa.l2_9B),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(opa.l2_16B),
+ .offset_words = 14,
+ .offset_bits = 3,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 4,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(opa.qos_type),
+ .offset_words = 14,
+ .offset_bits = 6,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(opa.qos_priority),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 3 },
+ { OPA_PATH_REC_FIELD(sl),
+ .offset_words = 14,
+ .offset_bits = 19,
+ .size_bits = 5 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { OPA_PATH_REC_FIELD(mtu_selector),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(mtu),
+ .offset_words = 15,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { OPA_PATH_REC_FIELD(rate_selector),
+ .offset_words = 15,
+ .offset_bits = 8,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(rate),
+ .offset_words = 15,
+ .offset_bits = 10,
+ .size_bits = 6 },
+ { OPA_PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 15,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(packet_life_time),
+ .offset_words = 15,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { OPA_PATH_REC_FIELD(preference),
+ .offset_words = 15,
+ .offset_bits = 24,
+ .size_bits = 8 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \
+ .field_name = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+ { MCMEMBER_REC_FIELD(mgid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(port_gid),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(qkey),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { MCMEMBER_REC_FIELD(mlid),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(mtu_selector),
+ .offset_words = 9,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(mtu),
+ .offset_words = 9,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(traffic_class),
+ .offset_words = 9,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(pkey),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(rate_selector),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(rate),
+ .offset_words = 10,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(packet_life_time_selector),
+ .offset_words = 10,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(packet_life_time),
+ .offset_words = 10,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(sl),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { MCMEMBER_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(scope),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(join_state),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(proxy_join),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 23 },
+};
+
+#define SERVICE_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \
+ .field_name = "sa_service_rec:" #field
+
+static const struct ib_field service_rec_table[] = {
+ { SERVICE_REC_FIELD(id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { SERVICE_REC_FIELD(gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(pkey),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { SERVICE_REC_FIELD(lease),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { SERVICE_REC_FIELD(key),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(name),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 64*8 },
+ { SERVICE_REC_FIELD(data8),
+ .offset_words = 28,
+ .offset_bits = 0,
+ .size_bits = 16*8 },
+ { SERVICE_REC_FIELD(data16),
+ .offset_words = 32,
+ .offset_bits = 0,
+ .size_bits = 8*16 },
+ { SERVICE_REC_FIELD(data32),
+ .offset_words = 36,
+ .offset_bits = 0,
+ .size_bits = 4*32 },
+ { SERVICE_REC_FIELD(data64),
+ .offset_words = 40,
+ .offset_bits = 0,
+ .size_bits = 2*64 },
+};
+
+#define CLASSPORTINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \
+ .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \
+ .field_name = "ib_class_port_info:" #field
+
+static const struct ib_field ib_classport_info_rec_table[] = {
+ { CLASSPORTINFO_REC_FIELD(base_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(class_version),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(capability_mask),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(redirect_tcslfl),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_lid),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(redirect_pkey),
+ .offset_words = 7,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(redirect_qp),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_qkey),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_gid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(trap_tcslfl),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_lid),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(trap_pkey),
+ .offset_words = 15,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_hlqp),
+ .offset_words = 16,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(trap_qkey),
+ .offset_words = 17,
+ .offset_bits = 0,
+ .size_bits = 32 },
+};
+
+#define OPA_CLASSPORTINFO_REC_FIELD(field) \
+ .struct_offset_bytes =\
+ offsetof(struct opa_class_port_info, field), \
+ .struct_size_bytes = \
+ sizeof((struct opa_class_port_info *)0)->field, \
+ .field_name = "opa_class_port_info:" #field
+
+static const struct ib_field opa_classport_info_rec_table[] = {
+ { OPA_CLASSPORTINFO_REC_FIELD(base_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { OPA_CLASSPORTINFO_REC_FIELD(class_version),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { OPA_CLASSPORTINFO_REC_FIELD(cap_mask),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { OPA_CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_tc_fl),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_lid),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_sl_qp),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_qkey),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_gid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_tc_fl),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_lid),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_hl_qp),
+ .offset_words = 16,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_qkey),
+ .offset_words = 17,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_pkey),
+ .offset_words = 18,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_pkey),
+ .offset_words = 18,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_sl_rsvd),
+ .offset_words = 19,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 19,
+ .offset_bits = 8,
+ .size_bits = 24 },
+};
+
+#define GUIDINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \
+ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \
+ .field_name = "sa_guidinfo_rec:" #field
+
+static const struct ib_field guidinfo_rec_table[] = {
+ { GUIDINFO_REC_FIELD(lid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { GUIDINFO_REC_FIELD(block_num),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res1),
+ .offset_words = 0,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res2),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { GUIDINFO_REC_FIELD(guid_info_list),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 512 },
+};
+
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+ query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+ return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+ struct ib_sa_query *query)
+{
+ struct sa_path_rec *sa_rec = query->mad_buf->context[1];
+ struct ib_sa_mad *mad = query->mad_buf->mad;
+ ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+ u16 val16;
+ u64 val64;
+ struct rdma_ls_resolve_header *header;
+
+ query->mad_buf->context[1] = NULL;
+
+ /* Construct the family header first */
+ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ memcpy(header->device_name, query->port->agent->device->name,
+ LS_DEVICE_NAME_MAX);
+ header->port_num = query->port->port_num;
+
+ if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+ sa_rec->reversible != 0)
+ query->path_use = LS_RESOLVE_PATH_USE_GMP;
+ else
+ query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+ header->path_use = query->path_use;
+
+ /* Now build the attributes */
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+ val64 = be64_to_cpu(sa_rec->service_id);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+ sizeof(val64), &val64);
+ }
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+ sizeof(sa_rec->dgid), &sa_rec->dgid);
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+ sizeof(sa_rec->sgid), &sa_rec->sgid);
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+ sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+ if (comp_mask & IB_SA_PATH_REC_PKEY) {
+ val16 = be16_to_cpu(sa_rec->pkey);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+ sizeof(val16), &val16);
+ }
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+ val16 = be16_to_cpu(sa_rec->qos_class);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+ sizeof(val16), &val16);
+ }
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+ int len = 0;
+
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+ len += nla_total_size(sizeof(u64));
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ len += nla_total_size(sizeof(u8));
+ if (comp_mask & IB_SA_PATH_REC_PKEY)
+ len += nla_total_size(sizeof(u16));
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+ len += nla_total_size(sizeof(u16));
+
+ /*
+ * Make sure that at least some of the required comp_mask bits are
+ * set.
+ */
+ if (WARN_ON(len == 0))
+ return len;
+
+ /* Add the family header */
+ len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+ return len;
+}
+
+static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *data;
+ int ret = 0;
+ struct ib_sa_mad *mad;
+ int len;
+
+ mad = query->mad_buf->mad;
+ len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+ if (len <= 0)
+ return -EMSGSIZE;
+
+ skb = nlmsg_new(len, gfp_mask);
+ if (!skb)
+ return -ENOMEM;
+
+ /* Put nlmsg header only for now */
+ data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ /* Add attributes */
+ ib_nl_set_path_rec_attrs(skb, query);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
+ if (!ret)
+ ret = len;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ unsigned long flags;
+ unsigned long delay;
+ int ret;
+
+ INIT_LIST_HEAD(&query->list);
+ query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+ /* Put the request on the list first.*/
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+ query->timeout = delay + jiffies;
+ list_add_tail(&query->list, &ib_nl_request_list);
+ /* Start the timeout if this is the only request */
+ if (ib_nl_request_list.next == &query->list)
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ ret = ib_nl_send_msg(query, gfp_mask);
+ if (ret <= 0) {
+ ret = -EIO;
+ /* Remove the request */
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_del(&query->list);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ } else {
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_sa_query *wait_query;
+ int found = 0;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+ /* Let the timeout to take care of the callback */
+ if (query == wait_query) {
+ query->flags |= IB_SA_CANCEL;
+ query->timeout = jiffies;
+ list_move(&query->list, &ib_nl_request_list);
+ found = 1;
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+ const struct nlmsghdr *nlh)
+{
+ struct ib_mad_send_wc mad_send_wc;
+ struct ib_sa_mad *mad = NULL;
+ const struct nlattr *head, *curr;
+ struct ib_path_rec_data *rec;
+ int len, rem;
+ u32 mask = 0;
+ int status = -EIO;
+
+ if (query->callback) {
+ head = (const struct nlattr *) nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+ switch (query->path_use) {
+ case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+ mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+ break;
+
+ case LS_RESOLVE_PATH_USE_ALL:
+ case LS_RESOLVE_PATH_USE_GMP:
+ default:
+ mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+ IB_PATH_BIDIRECTIONAL;
+ break;
+ }
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+ rec = nla_data(curr);
+ /*
+ * Get the first one. In the future, we may
+ * need to get up to 6 pathrecords.
+ */
+ if ((rec->flags & mask) == mask) {
+ mad = query->mad_buf->mad;
+ mad->mad_hdr.method |=
+ IB_MGMT_METHOD_RESP;
+ memcpy(mad->data, rec->path_rec,
+ sizeof(rec->path_rec));
+ status = 0;
+ break;
+ }
+ }
+ }
+ query->callback(query, status, mad);
+ }
+
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_SUCCESS;
+ send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+ unsigned long flags;
+ struct ib_sa_query *query;
+ unsigned long delay;
+ struct ib_mad_send_wc mad_send_wc;
+ int ret;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ while (!list_empty(&ib_nl_request_list)) {
+ query = list_entry(ib_nl_request_list.next,
+ struct ib_sa_query, list);
+
+ if (time_after(query->timeout, jiffies)) {
+ delay = query->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+ break;
+ }
+
+ list_del(&query->list);
+ ib_sa_disable_local_svc(query);
+ /* Hold the lock to protect against query cancellation */
+ if (ib_sa_query_cancelled(query))
+ ret = -1;
+ else
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ send_handler(query->port->agent, &mad_send_wc);
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ int timeout, delta, abs_delta;
+ const struct nlattr *attr;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ long delay = 0;
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk))
+ return -EPERM;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy, NULL);
+ attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+ if (ret || !attr)
+ goto settimeout_out;
+
+ timeout = *(int *) nla_data(attr);
+ if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+ if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+ delta = timeout - sa_local_svc_timeout_ms;
+ if (delta < 0)
+ abs_delta = -delta;
+ else
+ abs_delta = delta;
+
+ if (delta != 0) {
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ sa_local_svc_timeout_ms = timeout;
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ if (delta < 0 && abs_delta > query->timeout)
+ query->timeout = 0;
+ else
+ query->timeout += delta;
+
+ /* Get the new delay from the first entry */
+ if (!delay) {
+ delay = query->timeout - jiffies;
+ if (delay <= 0)
+ delay = 1;
+ }
+ }
+ if (delay)
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+ (unsigned long)delay);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ }
+
+settimeout_out:
+ return 0;
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return 0;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy, NULL);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long flags;
+ struct ib_sa_query *query;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_send_wc mad_send_wc;
+ int found = 0;
+ int ret;
+
+ if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk))
+ return -EPERM;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ /*
+ * If the query is cancelled, let the timeout routine
+ * take care of it.
+ */
+ if (nlh->nlmsg_seq == query->seq) {
+ found = !ib_sa_query_cancelled(query);
+ if (found)
+ list_del(&query->list);
+ break;
+ }
+ }
+
+ if (!found) {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ goto resp_out;
+ }
+
+ send_buf = query->mad_buf;
+
+ if (!ib_nl_is_good_resolve_resp(nlh)) {
+ /* if the result is a failure, send out the packet via IB */
+ ib_sa_disable_local_svc(query);
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ if (ret) {
+ mad_send_wc.send_buf = send_buf;
+ mad_send_wc.status = IB_WC_GENERAL_ERR;
+ send_handler(query->port->agent, &mad_send_wc);
+ }
+ } else {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ ib_nl_process_good_resolve_rsp(query, nlh);
+ }
+
+resp_out:
+ return 0;
+}
+
+static void free_sm_ah(struct kref *kref)
+{
+ struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+ rdma_destroy_ah(sm_ah->ah);
+ kfree(sm_ah);
+}
+
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+ atomic_set(&client->users, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+ ib_sa_client_put(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query. If the id and query don't match up or
+ * the query has already completed, nothing is done. Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *mad_buf;
+
+ spin_lock_irqsave(&idr_lock, flags);
+ if (idr_find(&query_idr, id) != query) {
+ spin_unlock_irqrestore(&idr_lock, flags);
+ return;
+ }
+ agent = query->port->agent;
+ mad_buf = query->mad_buf;
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ /*
+ * If the query is still on the netlink request list, schedule
+ * it to be cancelled by the timeout routine. Otherwise, it has been
+ * sent to the MAD layer and has to be cancelled from there.
+ */
+ if (!ib_nl_cancel_request(query))
+ ib_cancel_mad(agent, mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+ struct ib_sa_device *sa_dev;
+ struct ib_sa_port *port;
+ unsigned long flags;
+ u8 src_path_mask;
+
+ sa_dev = ib_get_client_data(device, &sa_client);
+ if (!sa_dev)
+ return 0x7f;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ spin_lock_irqsave(&port->ah_lock, flags);
+ src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ return src_path_mask;
+}
+
+static int roce_resolve_route_from_path(struct sa_path_rec *rec,
+ const struct ib_gid_attr *attr)
+{
+ struct rdma_dev_addr dev_addr = {};
+ union {
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+ int ret;
+
+ if (rec->roce.route_resolved)
+ return 0;
+ if (!attr || !attr->ndev)
+ return -EINVAL;
+
+ dev_addr.bound_dev_if = attr->ndev->ifindex;
+ /* TODO: Use net from the ib_gid_attr once it is added to it,
+ * until than, limit itself to init_net.
+ */
+ dev_addr.net = &init_net;
+
+ rdma_gid2ip((struct sockaddr *)&sgid_addr, &rec->sgid);
+ rdma_gid2ip((struct sockaddr *)&dgid_addr, &rec->dgid);
+
+ /* validate the route */
+ ret = rdma_resolve_ip_route((struct sockaddr *)&sgid_addr,
+ (struct sockaddr *)&dgid_addr, &dev_addr);
+ if (ret)
+ return ret;
+
+ if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+ dev_addr.network == RDMA_NETWORK_IPV6) &&
+ rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
+ return -EINVAL;
+
+ rec->roce.route_resolved = true;
+ return 0;
+}
+
+static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num,
+ struct sa_path_rec *rec,
+ struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *gid_attr)
+{
+ enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec);
+
+ if (!gid_attr) {
+ gid_attr = rdma_find_gid_by_port(device, &rec->sgid, type,
+ port_num, NULL);
+ if (IS_ERR(gid_attr))
+ return PTR_ERR(gid_attr);
+ } else
+ rdma_hold_gid_attr(gid_attr);
+
+ rdma_move_grh_sgid_attr(ah_attr, &rec->dgid,
+ be32_to_cpu(rec->flow_label),
+ rec->hop_limit, rec->traffic_class,
+ gid_attr);
+ return 0;
+}
+
+/**
+ * ib_init_ah_attr_from_path - Initialize address handle attributes based on
+ * an SA path record.
+ * @device: Device associated ah attributes initialization.
+ * @port_num: Port on the specified device.
+ * @rec: path record entry to use for ah attributes initialization.
+ * @ah_attr: address handle attributes to initialization from path record.
+ * @sgid_attr: SGID attribute to consider during initialization.
+ *
+ * When ib_init_ah_attr_from_path() returns success,
+ * (a) for IB link layer it optionally contains a reference to SGID attribute
+ * when GRH is present for IB link layer.
+ * (b) for RoCE link layer it contains a reference to SGID attribute.
+ * User must invoke rdma_destroy_ah_attr() to release reference to SGID
+ * attributes which are initialized using ib_init_ah_attr_from_path().
+ */
+int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num,
+ struct sa_path_rec *rec,
+ struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *gid_attr)
+{
+ int ret = 0;
+
+ memset(ah_attr, 0, sizeof(*ah_attr));
+ ah_attr->type = rdma_ah_find_type(device, port_num);
+ rdma_ah_set_sl(ah_attr, rec->sl);
+ rdma_ah_set_port_num(ah_attr, port_num);
+ rdma_ah_set_static_rate(ah_attr, rec->rate);
+
+ if (sa_path_is_roce(rec)) {
+ ret = roce_resolve_route_from_path(rec, gid_attr);
+ if (ret)
+ return ret;
+
+ memcpy(ah_attr->roce.dmac, sa_path_get_dmac(rec), ETH_ALEN);
+ } else {
+ rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec)));
+ if (sa_path_is_opa(rec) &&
+ rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE))
+ rdma_ah_set_make_grd(ah_attr, true);
+
+ rdma_ah_set_path_bits(ah_attr,
+ be32_to_cpu(sa_path_get_slid(rec)) &
+ get_src_path_mask(device, port_num));
+ }
+
+ if (rec->hop_limit > 0 || sa_path_is_roce(rec))
+ ret = init_ah_attr_grh_fields(device, port_num,
+ rec, ah_attr, gid_attr);
+ return ret;
+}
+EXPORT_SYMBOL(ib_init_ah_attr_from_path);
+
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ struct rdma_ah_attr ah_attr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&query->port->ah_lock, flags);
+ if (!query->port->sm_ah) {
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+ return -EAGAIN;
+ }
+ kref_get(&query->port->sm_ah->ref);
+ query->sm_ah = query->port->sm_ah;
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+
+ /*
+ * Always check if sm_ah has valid dlid assigned,
+ * before querying for class port info
+ */
+ if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) ||
+ !rdma_is_valid_unicast_lid(&ah_attr)) {
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ return -EAGAIN;
+ }
+ query->mad_buf = ib_create_send_mad(query->port->agent, 1,
+ query->sm_ah->pkey_index,
+ 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+ gfp_mask,
+ ((query->flags & IB_SA_QUERY_OPA) ?
+ OPA_MGMT_BASE_VERSION :
+ IB_MGMT_BASE_VERSION));
+ if (IS_ERR(query->mad_buf)) {
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ return -ENOMEM;
+ }
+
+ query->mad_buf->ah = query->sm_ah->ah;
+
+ return 0;
+}
+
+static void free_mad(struct ib_sa_query *query)
+{
+ ib_free_send_mad(query->mad_buf);
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent)
+{
+ struct ib_sa_mad *mad = query->mad_buf->mad;
+ unsigned long flags;
+
+ memset(mad, 0, sizeof *mad);
+
+ if (query->flags & IB_SA_QUERY_OPA) {
+ mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION;
+ mad->mad_hdr.class_version = OPA_SA_CLASS_VERSION;
+ } else {
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+ }
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ spin_lock_irqsave(&tid_lock, flags);
+ mad->mad_hdr.tid =
+ cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+ spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+{
+ bool preload = gfpflags_allow_blocking(gfp_mask);
+ unsigned long flags;
+ int ret, id;
+
+ if (preload)
+ idr_preload(gfp_mask);
+ spin_lock_irqsave(&idr_lock, flags);
+
+ id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
+
+ spin_unlock_irqrestore(&idr_lock, flags);
+ if (preload)
+ idr_preload_end();
+ if (id < 0)
+ return id;
+
+ query->mad_buf->timeout_ms = timeout_ms;
+ query->mad_buf->context[0] = query;
+ query->id = id;
+
+ if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) &&
+ (!(query->flags & IB_SA_QUERY_OPA))) {
+ if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) {
+ if (!ib_nl_make_request(query, gfp_mask))
+ return id;
+ }
+ ib_sa_disable_local_svc(query);
+ }
+
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ }
+
+ /*
+ * It's not safe to dereference query any more, because the
+ * send may already have completed and freed the query in
+ * another context.
+ */
+ return ret ? ret : id;
+}
+
+void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec)
+{
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute)
+{
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_path);
+
+static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client,
+ struct ib_device *device,
+ u8 port_num)
+{
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ unsigned long flags;
+ bool ret = false;
+
+ if (!sa_dev)
+ return ret;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ spin_lock_irqsave(&port->classport_lock, flags);
+ if (!port->classport_info.valid)
+ goto ret;
+
+ if (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_OPA)
+ ret = opa_get_cpi_capmask2(&port->classport_info.data.opa) &
+ OPA_CLASS_PORT_INFO_PR_SUPPORT;
+ret:
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ return ret;
+}
+
+enum opa_pr_supported {
+ PR_NOT_SUPPORTED,
+ PR_OPA_SUPPORTED,
+ PR_IB_SUPPORTED
+};
+
+/**
+ * Check if current PR query can be an OPA query.
+ * Retuns PR_NOT_SUPPORTED if a path record query is not
+ * possible, PR_OPA_SUPPORTED if an OPA path record query
+ * is possible and PR_IB_SUPPORTED if an IB path record
+ * query is possible.
+ */
+static int opa_pr_query_possible(struct ib_sa_client *client,
+ struct ib_device *device,
+ u8 port_num,
+ struct sa_path_rec *rec)
+{
+ struct ib_port_attr port_attr;
+
+ if (ib_query_port(device, port_num, &port_attr))
+ return PR_NOT_SUPPORTED;
+
+ if (ib_sa_opa_pathrecord_support(client, device, port_num))
+ return PR_OPA_SUPPORTED;
+
+ if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+ return PR_NOT_SUPPORTED;
+ else
+ return PR_IB_SUPPORTED;
+}
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_path_query *query =
+ container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+ if (mad) {
+ struct sa_path_rec rec;
+
+ if (sa_query->flags & IB_SA_QUERY_OPA) {
+ ib_unpack(opa_path_rec_table,
+ ARRAY_SIZE(opa_path_rec_table),
+ mad->data, &rec);
+ rec.rec_type = SA_PATH_REC_TYPE_OPA;
+ query->callback(status, &rec, query->context);
+ } else {
+ ib_unpack(path_rec_table,
+ ARRAY_SIZE(path_rec_table),
+ mad->data, &rec);
+ rec.rec_type = SA_PATH_REC_TYPE_IB;
+ sa_path_set_dmac_zero(&rec);
+
+ if (query->conv_pr) {
+ struct sa_path_rec opa;
+
+ memset(&opa, 0, sizeof(struct sa_path_rec));
+ sa_convert_path_ib_to_opa(&opa, &rec);
+ query->callback(status, &opa, query->context);
+ } else {
+ query->callback(status, &rec, query->context);
+ }
+ }
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+ struct ib_sa_path_query *query =
+ container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+ kfree(query->conv_pr);
+ kfree(query);
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_path_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ enum opa_pr_supported status;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ if ((rec->rec_type != SA_PATH_REC_TYPE_IB) &&
+ (rec->rec_type != SA_PATH_REC_TYPE_OPA))
+ return -EINVAL;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ if (rec->rec_type == SA_PATH_REC_TYPE_OPA) {
+ status = opa_pr_query_possible(client, device, port_num, rec);
+ if (status == PR_NOT_SUPPORTED) {
+ ret = -EINVAL;
+ goto err1;
+ } else if (status == PR_OPA_SUPPORTED) {
+ query->sa_query.flags |= IB_SA_QUERY_OPA;
+ } else {
+ query->conv_pr =
+ kmalloc(sizeof(*query->conv_pr), gfp_mask);
+ if (!query->conv_pr) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+ }
+ }
+
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err2;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+ query->sa_query.release = ib_sa_path_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ if (query->sa_query.flags & IB_SA_QUERY_OPA) {
+ ib_pack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table),
+ rec, mad->data);
+ } else if (query->conv_pr) {
+ sa_convert_path_opa_to_ib(query->conv_pr, rec);
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ query->conv_pr, mad->data);
+ } else {
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ rec, mad->data);
+ }
+
+ *sa_query = &query->sa_query;
+
+ query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+ query->sa_query.mad_buf->context[1] = (query->conv_pr) ?
+ query->conv_pr : rec;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err3;
+
+ return ret;
+
+err3:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+err2:
+ kfree(query->conv_pr);
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_service_query *query =
+ container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_service_rec rec;
+
+ ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_service_query, sa_query));
+}
+
+/**
+ * ib_sa_service_rec_query - Start Service Record operation
+ * @client:SA client
+ * @device:device to send request on
+ * @port_num: port number to send request on
+ * @method:SA method - should be get, set, or delete
+ * @rec:Service Record to send in request
+ * @comp_mask:component mask to send in request
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when request completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:request context, used to cancel request
+ *
+ * Send a Service Record set/get/delete to the SA to register,
+ * unregister or query a service record.
+ * The callback function will be called when the request completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_query() is negative, it is an
+ * error code. Otherwise it is a request ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num, u8 method,
+ struct ib_sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_service_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_service_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE)
+ return -EINVAL;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
+ query->sa_query.release = ib_sa_service_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
+ rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_query);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_mcmember_query *query =
+ container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_mcmember_rec rec;
+
+ ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_mcmember_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL;
+ query->sa_query.release = ib_sa_mcmember_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+
+/* Support GuidInfoRecord */
+static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_guidinfo_query *query =
+ container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_guidinfo_rec rec;
+
+ ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query));
+}
+
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_guidinfo_rec *rec,
+ ib_sa_comp_mask comp_mask, u8 method,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_guidinfo_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_guidinfo_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE) {
+ return -EINVAL;
+ }
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL;
+ query->sa_query.release = ib_sa_guidinfo_rec_release;
+
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec,
+ mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
+
+bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client,
+ struct ib_device *device,
+ u8 port_num)
+{
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ bool ret = false;
+ unsigned long flags;
+
+ if (!sa_dev)
+ return ret;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+
+ spin_lock_irqsave(&port->classport_lock, flags);
+ if ((port->classport_info.valid) &&
+ (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_IB))
+ ret = ib_get_cpi_capmask2(&port->classport_info.data.ib)
+ & IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT;
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_sendonly_fullmem_support);
+
+struct ib_classport_info_context {
+ struct completion done;
+ struct ib_sa_query *sa_query;
+};
+
+static void ib_classportinfo_cb(void *context)
+{
+ struct ib_classport_info_context *cb_ctx = context;
+
+ complete(&cb_ctx->done);
+}
+
+static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ unsigned long flags;
+ struct ib_sa_classport_info_query *query =
+ container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
+ struct ib_sa_classport_cache *info = &sa_query->port->classport_info;
+
+ if (mad) {
+ if (sa_query->flags & IB_SA_QUERY_OPA) {
+ struct opa_class_port_info rec;
+
+ ib_unpack(opa_classport_info_rec_table,
+ ARRAY_SIZE(opa_classport_info_rec_table),
+ mad->data, &rec);
+
+ spin_lock_irqsave(&sa_query->port->classport_lock,
+ flags);
+ if (!status && !info->valid) {
+ memcpy(&info->data.opa, &rec,
+ sizeof(info->data.opa));
+
+ info->valid = true;
+ info->data.type = RDMA_CLASS_PORT_INFO_OPA;
+ }
+ spin_unlock_irqrestore(&sa_query->port->classport_lock,
+ flags);
+
+ } else {
+ struct ib_class_port_info rec;
+
+ ib_unpack(ib_classport_info_rec_table,
+ ARRAY_SIZE(ib_classport_info_rec_table),
+ mad->data, &rec);
+
+ spin_lock_irqsave(&sa_query->port->classport_lock,
+ flags);
+ if (!status && !info->valid) {
+ memcpy(&info->data.ib, &rec,
+ sizeof(info->data.ib));
+
+ info->valid = true;
+ info->data.type = RDMA_CLASS_PORT_INFO_IB;
+ }
+ spin_unlock_irqrestore(&sa_query->port->classport_lock,
+ flags);
+ }
+ }
+ query->callback(query->context);
+}
+
+static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_classport_info_query,
+ sa_query));
+}
+
+static int ib_sa_classport_info_rec_query(struct ib_sa_port *port,
+ int timeout_ms,
+ void (*callback)(void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_mad_agent *agent;
+ struct ib_sa_classport_info_query *query;
+ struct ib_sa_mad *mad;
+ gfp_t gfp_mask = GFP_KERNEL;
+ int ret;
+
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ query->sa_query.flags |= rdma_cap_opa_ah(port->agent->device,
+ port->port_num) ?
+ IB_SA_QUERY_OPA : 0;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err_free;
+
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = ib_sa_classport_info_rec_callback;
+ query->sa_query.release = ib_sa_classport_info_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO);
+ mad->sa_hdr.comp_mask = 0;
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err_free_mad;
+
+ return ret;
+
+err_free_mad:
+ *sa_query = NULL;
+ free_mad(&query->sa_query);
+
+err_free:
+ kfree(query);
+ return ret;
+}
+
+static void update_ib_cpi(struct work_struct *work)
+{
+ struct ib_sa_port *port =
+ container_of(work, struct ib_sa_port, ib_cpi_work.work);
+ struct ib_classport_info_context *cb_context;
+ unsigned long flags;
+ int ret;
+
+ /* If the classport info is valid, nothing
+ * to do here.
+ */
+ spin_lock_irqsave(&port->classport_lock, flags);
+ if (port->classport_info.valid) {
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+
+ cb_context = kmalloc(sizeof(*cb_context), GFP_KERNEL);
+ if (!cb_context)
+ goto err_nomem;
+
+ init_completion(&cb_context->done);
+
+ ret = ib_sa_classport_info_rec_query(port, 3000,
+ ib_classportinfo_cb, cb_context,
+ &cb_context->sa_query);
+ if (ret < 0)
+ goto free_cb_err;
+ wait_for_completion(&cb_context->done);
+free_cb_err:
+ kfree(cb_context);
+ spin_lock_irqsave(&port->classport_lock, flags);
+
+ /* If the classport info is still not valid, the query should have
+ * failed for some reason. Retry issuing the query
+ */
+ if (!port->classport_info.valid) {
+ port->classport_info.retry_cnt++;
+ if (port->classport_info.retry_cnt <=
+ IB_SA_CPI_MAX_RETRY_CNT) {
+ unsigned long delay =
+ msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT);
+
+ queue_delayed_work(ib_wq, &port->ib_cpi_work, delay);
+ }
+ }
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+
+err_nomem:
+ return;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+ unsigned long flags;
+
+ if (query->callback)
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+ /* No callback -- already got recv */
+ break;
+ case IB_WC_RESP_TIMEOUT_ERR:
+ query->callback(query, -ETIMEDOUT, NULL);
+ break;
+ case IB_WC_WR_FLUSH_ERR:
+ query->callback(query, -EINTR, NULL);
+ break;
+ default:
+ query->callback(query, -EIO, NULL);
+ break;
+ }
+
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, query->id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ free_mad(query);
+ if (query->client)
+ ib_sa_client_put(query->client);
+ query->release(query);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_query *query;
+
+ if (!send_buf)
+ return;
+
+ query = send_buf->context[0];
+ if (query->callback) {
+ if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+ query->callback(query,
+ mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+ -EINVAL : 0,
+ (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+ else
+ query->callback(query, -EIO, NULL);
+ }
+
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static void update_sm_ah(struct work_struct *work)
+{
+ struct ib_sa_port *port =
+ container_of(work, struct ib_sa_port, update_task);
+ struct ib_sa_sm_ah *new_ah;
+ struct ib_port_attr port_attr;
+ struct rdma_ah_attr ah_attr;
+ bool grh_required;
+
+ if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+ pr_warn("Couldn't query port\n");
+ return;
+ }
+
+ new_ah = kmalloc(sizeof(*new_ah), GFP_KERNEL);
+ if (!new_ah)
+ return;
+
+ kref_init(&new_ah->ref);
+ new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+ new_ah->pkey_index = 0;
+ if (ib_find_pkey(port->agent->device, port->port_num,
+ IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+ pr_err("Couldn't find index for default PKey\n");
+
+ memset(&ah_attr, 0, sizeof(ah_attr));
+ ah_attr.type = rdma_ah_find_type(port->agent->device,
+ port->port_num);
+ rdma_ah_set_dlid(&ah_attr, port_attr.sm_lid);
+ rdma_ah_set_sl(&ah_attr, port_attr.sm_sl);
+ rdma_ah_set_port_num(&ah_attr, port->port_num);
+
+ grh_required = rdma_is_grh_required(port->agent->device,
+ port->port_num);
+
+ /*
+ * The OPA sm_lid of 0xFFFF needs special handling so that it can be
+ * differentiated from a permissive LID of 0xFFFF. We set the
+ * grh_required flag here so the SA can program the DGID in the
+ * address handle appropriately
+ */
+ if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA &&
+ (grh_required ||
+ port_attr.sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)))
+ rdma_ah_set_make_grd(&ah_attr, true);
+
+ if (ah_attr.type == RDMA_AH_ATTR_TYPE_IB && grh_required) {
+ rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH);
+ rdma_ah_set_subnet_prefix(&ah_attr,
+ cpu_to_be64(port_attr.subnet_prefix));
+ rdma_ah_set_interface_id(&ah_attr,
+ cpu_to_be64(IB_SA_WELL_KNOWN_GUID));
+ }
+
+ new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr);
+ if (IS_ERR(new_ah->ah)) {
+ pr_warn("Couldn't create new SM AH\n");
+ kfree(new_ah);
+ return;
+ }
+
+ spin_lock_irq(&port->ah_lock);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = new_ah;
+ spin_unlock_irq(&port->ah_lock);
+}
+
+static void ib_sa_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER) {
+ unsigned long flags;
+ struct ib_sa_device *sa_dev =
+ container_of(handler, typeof(*sa_dev), event_handler);
+ u8 port_num = event->element.port_num - sa_dev->start_port;
+ struct ib_sa_port *port = &sa_dev->port[port_num];
+
+ if (!rdma_cap_ib_sa(handler->device, port->port_num))
+ return;
+
+ spin_lock_irqsave(&port->ah_lock, flags);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = NULL;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ if (event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PORT_ACTIVE) {
+ unsigned long delay =
+ msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT);
+
+ spin_lock_irqsave(&port->classport_lock, flags);
+ port->classport_info.valid = false;
+ port->classport_info.retry_cnt = 0;
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ queue_delayed_work(ib_wq,
+ &port->ib_cpi_work, delay);
+ }
+ queue_work(ib_wq, &sa_dev->port[port_num].update_task);
+ }
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev;
+ int s, e, i;
+ int count = 0;
+
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
+
+ sa_dev = kzalloc(sizeof *sa_dev +
+ (e - s + 1) * sizeof (struct ib_sa_port),
+ GFP_KERNEL);
+ if (!sa_dev)
+ return;
+
+ sa_dev->start_port = s;
+ sa_dev->end_port = e;
+
+ for (i = 0; i <= e - s; ++i) {
+ spin_lock_init(&sa_dev->port[i].ah_lock);
+ if (!rdma_cap_ib_sa(device, i + 1))
+ continue;
+
+ sa_dev->port[i].sm_ah = NULL;
+ sa_dev->port[i].port_num = i + s;
+
+ spin_lock_init(&sa_dev->port[i].classport_lock);
+ sa_dev->port[i].classport_info.valid = false;
+
+ sa_dev->port[i].agent =
+ ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+ NULL, 0, send_handler,
+ recv_handler, sa_dev, 0);
+ if (IS_ERR(sa_dev->port[i].agent))
+ goto err;
+
+ INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+ INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work,
+ update_ib_cpi);
+
+ count++;
+ }
+
+ if (!count)
+ goto free;
+
+ ib_set_client_data(device, &sa_client, sa_dev);
+
+ /*
+ * We register our event handler after everything is set up,
+ * and then update our cached info after the event handler is
+ * registered to avoid any problems if a port changes state
+ * during our initialization.
+ */
+
+ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+ ib_register_event_handler(&sa_dev->event_handler);
+
+ for (i = 0; i <= e - s; ++i) {
+ if (rdma_cap_ib_sa(device, i + 1))
+ update_sm_ah(&sa_dev->port[i].update_task);
+ }
+
+ return;
+
+err:
+ while (--i >= 0) {
+ if (rdma_cap_ib_sa(device, i + 1))
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ }
+free:
+ kfree(sa_dev);
+ return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_sa_device *sa_dev = client_data;
+ int i;
+
+ if (!sa_dev)
+ return;
+
+ ib_unregister_event_handler(&sa_dev->event_handler);
+ flush_workqueue(ib_wq);
+
+ for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+ if (rdma_cap_ib_sa(device, i + 1)) {
+ cancel_delayed_work_sync(&sa_dev->port[i].ib_cpi_work);
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ if (sa_dev->port[i].sm_ah)
+ kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+ }
+
+ }
+
+ kfree(sa_dev);
+}
+
+int ib_sa_init(void)
+{
+ int ret;
+
+ get_random_bytes(&tid, sizeof tid);
+
+ atomic_set(&ib_nl_sa_request_seq, 0);
+
+ ret = ib_register_client(&sa_client);
+ if (ret) {
+ pr_err("Couldn't register ib_sa client\n");
+ goto err1;
+ }
+
+ ret = mcast_init();
+ if (ret) {
+ pr_err("Couldn't initialize multicast handling\n");
+ goto err2;
+ }
+
+ ib_nl_wq = alloc_ordered_workqueue("ib_nl_sa_wq", WQ_MEM_RECLAIM);
+ if (!ib_nl_wq) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+
+ INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
+ return 0;
+
+err3:
+ mcast_cleanup();
+err2:
+ ib_unregister_client(&sa_client);
+err1:
+ return ret;
+}
+
+void ib_sa_cleanup(void)
+{
+ cancel_delayed_work(&ib_nl_timed_work);
+ flush_workqueue(ib_nl_wq);
+ destroy_workqueue(ib_nl_wq);
+ mcast_cleanup();
+ ib_unregister_client(&sa_client);
+ idr_destroy(&query_idr);
+}
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
new file mode 100644
index 000000000..6df6cc55f
--- /dev/null
+++ b/drivers/infiniband/core/security.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/security.h>
+#include <linux/completion.h>
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include "core_priv.h"
+#include "mad_priv.h"
+
+static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp)
+{
+ struct pkey_index_qp_list *pkey = NULL;
+ struct pkey_index_qp_list *tmp_pkey;
+ struct ib_device *dev = pp->sec->dev;
+
+ spin_lock(&dev->port_pkey_list[pp->port_num].list_lock);
+ list_for_each_entry(tmp_pkey,
+ &dev->port_pkey_list[pp->port_num].pkey_list,
+ pkey_index_list) {
+ if (tmp_pkey->pkey_index == pp->pkey_index) {
+ pkey = tmp_pkey;
+ break;
+ }
+ }
+ spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock);
+ return pkey;
+}
+
+static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp,
+ u16 *pkey,
+ u64 *subnet_prefix)
+{
+ struct ib_device *dev = pp->sec->dev;
+ int ret;
+
+ ret = ib_get_cached_pkey(dev, pp->port_num, pp->pkey_index, pkey);
+ if (ret)
+ return ret;
+
+ ret = ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix);
+
+ return ret;
+}
+
+static int enforce_qp_pkey_security(u16 pkey,
+ u64 subnet_prefix,
+ struct ib_qp_security *qp_sec)
+{
+ struct ib_qp_security *shared_qp_sec;
+ int ret;
+
+ ret = security_ib_pkey_access(qp_sec->security, subnet_prefix, pkey);
+ if (ret)
+ return ret;
+
+ list_for_each_entry(shared_qp_sec,
+ &qp_sec->shared_qp_list,
+ shared_qp_list) {
+ ret = security_ib_pkey_access(shared_qp_sec->security,
+ subnet_prefix,
+ pkey);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* The caller of this function must hold the QP security
+ * mutex of the QP of the security structure in *pps.
+ *
+ * It takes separate ports_pkeys and security structure
+ * because in some cases the pps will be for a new settings
+ * or the pps will be for the real QP and security structure
+ * will be for a shared QP.
+ */
+static int check_qp_port_pkey_settings(struct ib_ports_pkeys *pps,
+ struct ib_qp_security *sec)
+{
+ u64 subnet_prefix;
+ u16 pkey;
+ int ret = 0;
+
+ if (!pps)
+ return 0;
+
+ if (pps->main.state != IB_PORT_PKEY_NOT_VALID) {
+ ret = get_pkey_and_subnet_prefix(&pps->main,
+ &pkey,
+ &subnet_prefix);
+ if (ret)
+ return ret;
+
+ ret = enforce_qp_pkey_security(pkey,
+ subnet_prefix,
+ sec);
+ if (ret)
+ return ret;
+ }
+
+ if (pps->alt.state != IB_PORT_PKEY_NOT_VALID) {
+ ret = get_pkey_and_subnet_prefix(&pps->alt,
+ &pkey,
+ &subnet_prefix);
+ if (ret)
+ return ret;
+
+ ret = enforce_qp_pkey_security(pkey,
+ subnet_prefix,
+ sec);
+ }
+
+ return ret;
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static void qp_to_error(struct ib_qp_security *sec)
+{
+ struct ib_qp_security *shared_qp_sec;
+ struct ib_qp_attr attr = {
+ .qp_state = IB_QPS_ERR
+ };
+ struct ib_event event = {
+ .event = IB_EVENT_QP_FATAL
+ };
+
+ /* If the QP is in the process of being destroyed
+ * the qp pointer in the security structure is
+ * undefined. It cannot be modified now.
+ */
+ if (sec->destroying)
+ return;
+
+ ib_modify_qp(sec->qp,
+ &attr,
+ IB_QP_STATE);
+
+ if (sec->qp->event_handler && sec->qp->qp_context) {
+ event.element.qp = sec->qp;
+ sec->qp->event_handler(&event,
+ sec->qp->qp_context);
+ }
+
+ list_for_each_entry(shared_qp_sec,
+ &sec->shared_qp_list,
+ shared_qp_list) {
+ struct ib_qp *qp = shared_qp_sec->qp;
+
+ if (qp->event_handler && qp->qp_context) {
+ event.element.qp = qp;
+ event.device = qp->device;
+ qp->event_handler(&event,
+ qp->qp_context);
+ }
+ }
+}
+
+static inline void check_pkey_qps(struct pkey_index_qp_list *pkey,
+ struct ib_device *device,
+ u8 port_num,
+ u64 subnet_prefix)
+{
+ struct ib_port_pkey *pp, *tmp_pp;
+ bool comp;
+ LIST_HEAD(to_error_list);
+ u16 pkey_val;
+
+ if (!ib_get_cached_pkey(device,
+ port_num,
+ pkey->pkey_index,
+ &pkey_val)) {
+ spin_lock(&pkey->qp_list_lock);
+ list_for_each_entry(pp, &pkey->qp_list, qp_list) {
+ if (atomic_read(&pp->sec->error_list_count))
+ continue;
+
+ if (enforce_qp_pkey_security(pkey_val,
+ subnet_prefix,
+ pp->sec)) {
+ atomic_inc(&pp->sec->error_list_count);
+ list_add(&pp->to_error_list,
+ &to_error_list);
+ }
+ }
+ spin_unlock(&pkey->qp_list_lock);
+ }
+
+ list_for_each_entry_safe(pp,
+ tmp_pp,
+ &to_error_list,
+ to_error_list) {
+ mutex_lock(&pp->sec->mutex);
+ qp_to_error(pp->sec);
+ list_del(&pp->to_error_list);
+ atomic_dec(&pp->sec->error_list_count);
+ comp = pp->sec->destroying;
+ mutex_unlock(&pp->sec->mutex);
+
+ if (comp)
+ complete(&pp->sec->error_complete);
+ }
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static int port_pkey_list_insert(struct ib_port_pkey *pp)
+{
+ struct pkey_index_qp_list *tmp_pkey;
+ struct pkey_index_qp_list *pkey;
+ struct ib_device *dev;
+ u8 port_num = pp->port_num;
+ int ret = 0;
+
+ if (pp->state != IB_PORT_PKEY_VALID)
+ return 0;
+
+ dev = pp->sec->dev;
+
+ pkey = get_pkey_idx_qp_list(pp);
+
+ if (!pkey) {
+ bool found = false;
+
+ pkey = kzalloc(sizeof(*pkey), GFP_KERNEL);
+ if (!pkey)
+ return -ENOMEM;
+
+ spin_lock(&dev->port_pkey_list[port_num].list_lock);
+ /* Check for the PKey again. A racing process may
+ * have created it.
+ */
+ list_for_each_entry(tmp_pkey,
+ &dev->port_pkey_list[port_num].pkey_list,
+ pkey_index_list) {
+ if (tmp_pkey->pkey_index == pp->pkey_index) {
+ kfree(pkey);
+ pkey = tmp_pkey;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ pkey->pkey_index = pp->pkey_index;
+ spin_lock_init(&pkey->qp_list_lock);
+ INIT_LIST_HEAD(&pkey->qp_list);
+ list_add(&pkey->pkey_index_list,
+ &dev->port_pkey_list[port_num].pkey_list);
+ }
+ spin_unlock(&dev->port_pkey_list[port_num].list_lock);
+ }
+
+ spin_lock(&pkey->qp_list_lock);
+ list_add(&pp->qp_list, &pkey->qp_list);
+ spin_unlock(&pkey->qp_list_lock);
+
+ pp->state = IB_PORT_PKEY_LISTED;
+
+ return ret;
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static void port_pkey_list_remove(struct ib_port_pkey *pp)
+{
+ struct pkey_index_qp_list *pkey;
+
+ if (pp->state != IB_PORT_PKEY_LISTED)
+ return;
+
+ pkey = get_pkey_idx_qp_list(pp);
+
+ spin_lock(&pkey->qp_list_lock);
+ list_del(&pp->qp_list);
+ spin_unlock(&pkey->qp_list_lock);
+
+ /* The setting may still be valid, i.e. after
+ * a destroy has failed for example.
+ */
+ pp->state = IB_PORT_PKEY_VALID;
+}
+
+static void destroy_qp_security(struct ib_qp_security *sec)
+{
+ security_ib_free_security(sec->security);
+ kfree(sec->ports_pkeys);
+ kfree(sec);
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp,
+ const struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ struct ib_ports_pkeys *new_pps;
+ struct ib_ports_pkeys *qp_pps = qp->qp_sec->ports_pkeys;
+
+ new_pps = kzalloc(sizeof(*new_pps), GFP_KERNEL);
+ if (!new_pps)
+ return NULL;
+
+ if (qp_attr_mask & IB_QP_PORT)
+ new_pps->main.port_num = qp_attr->port_num;
+ else if (qp_pps)
+ new_pps->main.port_num = qp_pps->main.port_num;
+
+ if (qp_attr_mask & IB_QP_PKEY_INDEX)
+ new_pps->main.pkey_index = qp_attr->pkey_index;
+ else if (qp_pps)
+ new_pps->main.pkey_index = qp_pps->main.pkey_index;
+
+ if (((qp_attr_mask & IB_QP_PKEY_INDEX) &&
+ (qp_attr_mask & IB_QP_PORT)) ||
+ (qp_pps && qp_pps->main.state != IB_PORT_PKEY_NOT_VALID))
+ new_pps->main.state = IB_PORT_PKEY_VALID;
+
+ if (qp_attr_mask & IB_QP_ALT_PATH) {
+ new_pps->alt.port_num = qp_attr->alt_port_num;
+ new_pps->alt.pkey_index = qp_attr->alt_pkey_index;
+ new_pps->alt.state = IB_PORT_PKEY_VALID;
+ } else if (qp_pps) {
+ new_pps->alt.port_num = qp_pps->alt.port_num;
+ new_pps->alt.pkey_index = qp_pps->alt.pkey_index;
+ if (qp_pps->alt.state != IB_PORT_PKEY_NOT_VALID)
+ new_pps->alt.state = IB_PORT_PKEY_VALID;
+ }
+
+ new_pps->main.sec = qp->qp_sec;
+ new_pps->alt.sec = qp->qp_sec;
+ return new_pps;
+}
+
+int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
+{
+ struct ib_qp *real_qp = qp->real_qp;
+ int ret;
+
+ ret = ib_create_qp_security(qp, dev);
+
+ if (ret)
+ return ret;
+
+ if (!qp->qp_sec)
+ return 0;
+
+ mutex_lock(&real_qp->qp_sec->mutex);
+ ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
+ qp->qp_sec);
+
+ if (ret)
+ goto ret;
+
+ if (qp != real_qp)
+ list_add(&qp->qp_sec->shared_qp_list,
+ &real_qp->qp_sec->shared_qp_list);
+ret:
+ mutex_unlock(&real_qp->qp_sec->mutex);
+ if (ret)
+ destroy_qp_security(qp->qp_sec);
+
+ return ret;
+}
+
+void ib_close_shared_qp_security(struct ib_qp_security *sec)
+{
+ struct ib_qp *real_qp = sec->qp->real_qp;
+
+ mutex_lock(&real_qp->qp_sec->mutex);
+ list_del(&sec->shared_qp_list);
+ mutex_unlock(&real_qp->qp_sec->mutex);
+
+ destroy_qp_security(sec);
+}
+
+int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
+{
+ u8 i = rdma_start_port(dev);
+ bool is_ib = false;
+ int ret;
+
+ while (i <= rdma_end_port(dev) && !is_ib)
+ is_ib = rdma_protocol_ib(dev, i++);
+
+ /* If this isn't an IB device don't create the security context */
+ if (!is_ib)
+ return 0;
+
+ qp->qp_sec = kzalloc(sizeof(*qp->qp_sec), GFP_KERNEL);
+ if (!qp->qp_sec)
+ return -ENOMEM;
+
+ qp->qp_sec->qp = qp;
+ qp->qp_sec->dev = dev;
+ mutex_init(&qp->qp_sec->mutex);
+ INIT_LIST_HEAD(&qp->qp_sec->shared_qp_list);
+ atomic_set(&qp->qp_sec->error_list_count, 0);
+ init_completion(&qp->qp_sec->error_complete);
+ ret = security_ib_alloc_security(&qp->qp_sec->security);
+ if (ret) {
+ kfree(qp->qp_sec);
+ qp->qp_sec = NULL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_create_qp_security);
+
+void ib_destroy_qp_security_begin(struct ib_qp_security *sec)
+{
+ /* Return if not IB */
+ if (!sec)
+ return;
+
+ mutex_lock(&sec->mutex);
+
+ /* Remove the QP from the lists so it won't get added to
+ * a to_error_list during the destroy process.
+ */
+ if (sec->ports_pkeys) {
+ port_pkey_list_remove(&sec->ports_pkeys->main);
+ port_pkey_list_remove(&sec->ports_pkeys->alt);
+ }
+
+ /* If the QP is already in one or more of those lists
+ * the destroying flag will ensure the to error flow
+ * doesn't operate on an undefined QP.
+ */
+ sec->destroying = true;
+
+ /* Record the error list count to know how many completions
+ * to wait for.
+ */
+ sec->error_comps_pending = atomic_read(&sec->error_list_count);
+
+ mutex_unlock(&sec->mutex);
+}
+
+void ib_destroy_qp_security_abort(struct ib_qp_security *sec)
+{
+ int ret;
+ int i;
+
+ /* Return if not IB */
+ if (!sec)
+ return;
+
+ /* If a concurrent cache update is in progress this
+ * QP security could be marked for an error state
+ * transition. Wait for this to complete.
+ */
+ for (i = 0; i < sec->error_comps_pending; i++)
+ wait_for_completion(&sec->error_complete);
+
+ mutex_lock(&sec->mutex);
+ sec->destroying = false;
+
+ /* Restore the position in the lists and verify
+ * access is still allowed in case a cache update
+ * occurred while attempting to destroy.
+ *
+ * Because these setting were listed already
+ * and removed during ib_destroy_qp_security_begin
+ * we know the pkey_index_qp_list for the PKey
+ * already exists so port_pkey_list_insert won't fail.
+ */
+ if (sec->ports_pkeys) {
+ port_pkey_list_insert(&sec->ports_pkeys->main);
+ port_pkey_list_insert(&sec->ports_pkeys->alt);
+ }
+
+ ret = check_qp_port_pkey_settings(sec->ports_pkeys, sec);
+ if (ret)
+ qp_to_error(sec);
+
+ mutex_unlock(&sec->mutex);
+}
+
+void ib_destroy_qp_security_end(struct ib_qp_security *sec)
+{
+ int i;
+
+ /* Return if not IB */
+ if (!sec)
+ return;
+
+ /* If a concurrent cache update is occurring we must
+ * wait until this QP security structure is processed
+ * in the QP to error flow before destroying it because
+ * the to_error_list is in use.
+ */
+ for (i = 0; i < sec->error_comps_pending; i++)
+ wait_for_completion(&sec->error_complete);
+
+ destroy_qp_security(sec);
+}
+
+void ib_security_cache_change(struct ib_device *device,
+ u8 port_num,
+ u64 subnet_prefix)
+{
+ struct pkey_index_qp_list *pkey;
+
+ list_for_each_entry(pkey,
+ &device->port_pkey_list[port_num].pkey_list,
+ pkey_index_list) {
+ check_pkey_qps(pkey,
+ device,
+ port_num,
+ subnet_prefix);
+ }
+}
+
+void ib_security_destroy_port_pkey_list(struct ib_device *device)
+{
+ struct pkey_index_qp_list *pkey, *tmp_pkey;
+ int i;
+
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ spin_lock(&device->port_pkey_list[i].list_lock);
+ list_for_each_entry_safe(pkey,
+ tmp_pkey,
+ &device->port_pkey_list[i].pkey_list,
+ pkey_index_list) {
+ list_del(&pkey->pkey_index_list);
+ kfree(pkey);
+ }
+ spin_unlock(&device->port_pkey_list[i].list_lock);
+ }
+}
+
+int ib_security_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata)
+{
+ int ret = 0;
+ struct ib_ports_pkeys *tmp_pps;
+ struct ib_ports_pkeys *new_pps = NULL;
+ struct ib_qp *real_qp = qp->real_qp;
+ bool special_qp = (real_qp->qp_type == IB_QPT_SMI ||
+ real_qp->qp_type == IB_QPT_GSI ||
+ real_qp->qp_type >= IB_QPT_RESERVED1);
+ bool pps_change = ((qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) ||
+ (qp_attr_mask & IB_QP_ALT_PATH));
+
+ WARN_ONCE((qp_attr_mask & IB_QP_PORT &&
+ rdma_protocol_ib(real_qp->device, qp_attr->port_num) &&
+ !real_qp->qp_sec),
+ "%s: QP security is not initialized for IB QP: %d\n",
+ __func__, real_qp->qp_num);
+
+ /* The port/pkey settings are maintained only for the real QP. Open
+ * handles on the real QP will be in the shared_qp_list. When
+ * enforcing security on the real QP all the shared QPs will be
+ * checked as well.
+ */
+
+ if (pps_change && !special_qp && real_qp->qp_sec) {
+ mutex_lock(&real_qp->qp_sec->mutex);
+ new_pps = get_new_pps(real_qp,
+ qp_attr,
+ qp_attr_mask);
+ if (!new_pps) {
+ mutex_unlock(&real_qp->qp_sec->mutex);
+ return -ENOMEM;
+ }
+ /* Add this QP to the lists for the new port
+ * and pkey settings before checking for permission
+ * in case there is a concurrent cache update
+ * occurring. Walking the list for a cache change
+ * doesn't acquire the security mutex unless it's
+ * sending the QP to error.
+ */
+ ret = port_pkey_list_insert(&new_pps->main);
+
+ if (!ret)
+ ret = port_pkey_list_insert(&new_pps->alt);
+
+ if (!ret)
+ ret = check_qp_port_pkey_settings(new_pps,
+ real_qp->qp_sec);
+ }
+
+ if (!ret)
+ ret = real_qp->device->modify_qp(real_qp,
+ qp_attr,
+ qp_attr_mask,
+ udata);
+
+ if (new_pps) {
+ /* Clean up the lists and free the appropriate
+ * ports_pkeys structure.
+ */
+ if (ret) {
+ tmp_pps = new_pps;
+ } else {
+ tmp_pps = real_qp->qp_sec->ports_pkeys;
+ real_qp->qp_sec->ports_pkeys = new_pps;
+ }
+
+ if (tmp_pps) {
+ port_pkey_list_remove(&tmp_pps->main);
+ port_pkey_list_remove(&tmp_pps->alt);
+ }
+ kfree(tmp_pps);
+ mutex_unlock(&real_qp->qp_sec->mutex);
+ }
+ return ret;
+}
+
+static int ib_security_pkey_access(struct ib_device *dev,
+ u8 port_num,
+ u16 pkey_index,
+ void *sec)
+{
+ u64 subnet_prefix;
+ u16 pkey;
+ int ret;
+
+ if (!rdma_protocol_ib(dev, port_num))
+ return 0;
+
+ ret = ib_get_cached_pkey(dev, port_num, pkey_index, &pkey);
+ if (ret)
+ return ret;
+
+ ret = ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix);
+
+ if (ret)
+ return ret;
+
+ return security_ib_pkey_access(sec, subnet_prefix, pkey);
+}
+
+static int ib_mad_agent_security_change(struct notifier_block *nb,
+ unsigned long event,
+ void *data)
+{
+ struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb);
+
+ if (event != LSM_POLICY_CHANGE)
+ return NOTIFY_DONE;
+
+ ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security,
+ ag->device->name,
+ ag->port_num);
+
+ return NOTIFY_OK;
+}
+
+int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
+ enum ib_qp_type qp_type)
+{
+ int ret;
+
+ if (!rdma_protocol_ib(agent->device, agent->port_num))
+ return 0;
+
+ ret = security_ib_alloc_security(&agent->security);
+ if (ret)
+ return ret;
+
+ if (qp_type != IB_QPT_SMI)
+ return 0;
+
+ ret = security_ib_endport_manage_subnet(agent->security,
+ agent->device->name,
+ agent->port_num);
+ if (ret)
+ goto free_security;
+
+ agent->lsm_nb.notifier_call = ib_mad_agent_security_change;
+ ret = register_lsm_notifier(&agent->lsm_nb);
+ if (ret)
+ goto free_security;
+
+ agent->smp_allowed = true;
+ agent->lsm_nb_reg = true;
+ return 0;
+
+free_security:
+ security_ib_free_security(agent->security);
+ return ret;
+}
+
+void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
+{
+ if (!rdma_protocol_ib(agent->device, agent->port_num))
+ return;
+
+ if (agent->lsm_nb_reg)
+ unregister_lsm_notifier(&agent->lsm_nb);
+
+ security_ib_free_security(agent->security);
+}
+
+int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
+{
+ if (!rdma_protocol_ib(map->agent.device, map->agent.port_num))
+ return 0;
+
+ if (map->agent.qp->qp_type == IB_QPT_SMI) {
+ if (!map->agent.smp_allowed)
+ return -EACCES;
+ return 0;
+ }
+
+ return ib_security_pkey_access(map->agent.device,
+ map->agent.port_num,
+ pkey_index,
+ map->agent.security);
+}
diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c
new file mode 100644
index 000000000..f19b23817
--- /dev/null
+++ b/drivers/infiniband/core/smi.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+#include "opa_smi.h"
+
+static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ const u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!direction) {
+ /* C14-9:1 */
+ if (hop_cnt && *hop_ptr == 0) {
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:2 */
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == hop_cnt) {
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (is_switch ||
+ dr_dlid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- Fail unreasonable hop pointer */
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+ /* C14-13:1 */
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (*hop_ptr == 1) {
+ (*hop_ptr)--;
+ /* C14-13:3 -- SMPs destined for SM shouldn't be here */
+ return (is_switch ||
+ dr_slid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+ if (*hop_ptr == 0)
+ return IB_SMI_HANDLE;
+
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return IB_SMI_DISCARD;
+ }
+}
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return IB_SMI_DISCARD if the SMP should be discarded
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ bool is_switch, int port_num)
+{
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, int port_num)
+{
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num,
+ int phys_port_cnt,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!direction) {
+ /* C14-9:1 -- sender should have incremented hop_ptr */
+ if (hop_cnt && *hop_ptr == 0)
+ return IB_SMI_DISCARD;
+
+ /* C14-9:2 -- intermediate hop */
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
+ return (initial_path[*hop_ptr+1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == hop_cnt) {
+ if (hop_cnt)
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
+
+ return (is_switch ||
+ dr_dlid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- fail unreasonable hop pointer */
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+
+ /* C14-13:1 */
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ /* hop_ptr updated when sending */
+ return (return_path[*hop_ptr-1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == 1) {
+ if (dr_slid_is_permissive) {
+ /* giving SMP to SM - update hop_ptr */
+ (*hop_ptr)--;
+ return IB_SMI_HANDLE;
+ }
+ /* hop_ptr updated when sending */
+ return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> give to SM */
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return (*hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt)
+{
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt)
+{
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ if (!direction) {
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-9:3 -- at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt)
+ return (dr_dlid_is_permissive ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ if (hop_ptr == hop_cnt + 1)
+ return IB_SMI_SEND;
+ } else {
+ /* C14-13:2 -- intermediate hop */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1)
+ return (!dr_slid_is_permissive ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+ }
+ return IB_SMI_LOCAL;
+
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+ return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
+ smp->return_path[smp->hop_ptr-1]);
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int opa_smi_get_fwd_port(struct opa_smp *smp)
+{
+ return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] :
+ smp->route.dr.return_path[smp->hop_ptr-1];
+}
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
new file mode 100644
index 000000000..33c91c8a1
--- /dev/null
+++ b/drivers/infiniband/core/smi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+enum smi_action {
+ IB_SMI_DISCARD,
+ IB_SMI_HANDLE
+};
+
+enum smi_forward_action {
+ IB_SMI_LOCAL, /* SMP should be completed up the stack */
+ IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */
+ IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */
+};
+
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt);
+int smi_get_fwd_port(struct ib_smp *smp);
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ bool is_switch, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return ((device->process_mad &&
+ !ib_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return ((device->process_mad &&
+ ib_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+#endif /* __SMI_H_ */
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
new file mode 100644
index 000000000..ace40bb98
--- /dev/null
+++ b/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,1381 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_pma.h>
+#include <rdma/ib_cache.h>
+
+struct ib_port;
+
+struct gid_attr_group {
+ struct ib_port *port;
+ struct kobject kobj;
+ struct attribute_group ndev;
+ struct attribute_group type;
+};
+struct ib_port {
+ struct kobject kobj;
+ struct ib_device *ibdev;
+ struct gid_attr_group *gid_attr_group;
+ struct attribute_group gid_group;
+ struct attribute_group pkey_group;
+ struct attribute_group *pma_table;
+ struct attribute_group *hw_stats_ag;
+ struct rdma_hw_stats *hw_stats;
+ u8 port_num;
+};
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct ib_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+struct port_table_attribute {
+ struct port_attribute attr;
+ char name[8];
+ int index;
+ __be16 attr_id;
+};
+
+struct hw_stats_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t count);
+ int index;
+ u8 port_num;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static ssize_t port_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->store)
+ return -EIO;
+ return port_attr->store(p, port_attr, buf, count);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show,
+ .store = port_attr_store
+};
+
+static ssize_t gid_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct gid_attr_group,
+ kobj)->port;
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops gid_attr_sysfs_ops = {
+ .show = gid_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ static const char *state_name[] = {
+ [IB_PORT_NOP] = "NOP",
+ [IB_PORT_DOWN] = "DOWN",
+ [IB_PORT_INIT] = "INIT",
+ [IB_PORT_ARMED] = "ARMED",
+ [IB_PORT_ACTIVE] = "ACTIVE",
+ [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
+ };
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d: %s\n", attr.state,
+ attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
+ state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+ struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ char *speed = "";
+ int rate; /* in deci-Gb/sec */
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.active_speed) {
+ case IB_SPEED_DDR:
+ speed = " DDR";
+ rate = 50;
+ break;
+ case IB_SPEED_QDR:
+ speed = " QDR";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR10:
+ speed = " FDR10";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR:
+ speed = " FDR";
+ rate = 140;
+ break;
+ case IB_SPEED_EDR:
+ speed = " EDR";
+ rate = 250;
+ break;
+ case IB_SPEED_HDR:
+ speed = " HDR";
+ rate = 500;
+ break;
+ case IB_SPEED_SDR:
+ default: /* default to SDR for invalid rates */
+ speed = " SDR";
+ rate = 25;
+ break;
+ }
+
+ rate *= ib_width_enum_to_int(attr.active_width);
+ if (rate < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+ rate / 10, rate % 10 ? ".5" : "",
+ ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.phys_state) {
+ case 1: return sprintf(buf, "1: Sleep\n");
+ case 2: return sprintf(buf, "2: Polling\n");
+ case 3: return sprintf(buf, "3: Disabled\n");
+ case 4: return sprintf(buf, "4: PortConfigurationTraining\n");
+ case 5: return sprintf(buf, "5: LinkUp\n");
+ case 6: return sprintf(buf, "6: LinkErrorRecovery\n");
+ case 7: return sprintf(buf, "7: Phy Test\n");
+ default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+ }
+}
+
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ return sprintf(buf, "%s\n", "InfiniBand");
+ case IB_LINK_LAYER_ETHERNET:
+ return sprintf(buf, "%s\n", "Ethernet");
+ default:
+ return sprintf(buf, "%s\n", "Unknown");
+ }
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+static struct attribute *port_default_attrs[] = {
+ &port_attr_state.attr,
+ &port_attr_lid.attr,
+ &port_attr_lid_mask_count.attr,
+ &port_attr_sm_lid.attr,
+ &port_attr_sm_sl.attr,
+ &port_attr_cap_mask.attr,
+ &port_attr_rate.attr,
+ &port_attr_phys_state.attr,
+ &port_attr_link_layer.attr,
+ NULL
+};
+
+static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf)
+{
+ if (!gid_attr->ndev)
+ return -EINVAL;
+
+ return sprintf(buf, "%s\n", gid_attr->ndev->name);
+}
+
+static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf)
+{
+ return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type));
+}
+
+static ssize_t _show_port_gid_attr(
+ struct ib_port *p, struct port_attribute *attr, char *buf,
+ size_t (*print)(const struct ib_gid_attr *gid_attr, char *buf))
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ const struct ib_gid_attr *gid_attr;
+ ssize_t ret;
+
+ gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index);
+ if (IS_ERR(gid_attr))
+ return PTR_ERR(gid_attr);
+
+ ret = print(gid_attr, buf);
+ rdma_put_gid_attr(gid_attr);
+ return ret;
+}
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ const struct ib_gid_attr *gid_attr;
+ ssize_t ret;
+
+ gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index);
+ if (IS_ERR(gid_attr)) {
+ const union ib_gid zgid = {};
+
+ /* If reading GID fails, it is likely due to GID entry being
+ * empty (invalid) or reserved GID in the table. User space
+ * expects to read GID table entries as long as it given index
+ * is within GID table size. Administrative/debugging tool
+ * fails to query rest of the GID entries if it hits error
+ * while querying a GID of the given index. To avoid user
+ * space throwing such error on fail to read gid, return zero
+ * GID as before. This maintains backward compatibility.
+ */
+ return sprintf(buf, "%pI6\n", zgid.raw);
+ }
+
+ ret = sprintf(buf, "%pI6\n", gid_attr->gid.raw);
+ rdma_put_gid_attr(gid_attr);
+ return ret;
+}
+
+static ssize_t show_port_gid_attr_ndev(struct ib_port *p,
+ struct port_attribute *attr, char *buf)
+{
+ return _show_port_gid_attr(p, attr, buf, print_ndev);
+}
+
+static ssize_t show_port_gid_attr_gid_type(struct ib_port *p,
+ struct port_attribute *attr,
+ char *buf)
+{
+ return _show_port_gid_attr(p, attr, buf, print_gid_type);
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ u16 pkey;
+ ssize_t ret;
+
+ ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
+struct port_table_attribute port_pma_attr_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \
+ .attr_id = IB_PMA_PORT_COUNTERS , \
+}
+
+#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \
+struct port_table_attribute port_pma_attr_ext_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16), \
+ .attr_id = IB_PMA_PORT_COUNTERS_EXT , \
+}
+
+/*
+ * Get a Perfmgmt MAD block of data.
+ * Returns error code or the number of bytes retrieved.
+ */
+static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
+ void *data, int offset, size_t size)
+{
+ struct ib_mad *in_mad;
+ struct ib_mad *out_mad;
+ size_t mad_size = sizeof(*out_mad);
+ u16 out_mad_pkey_index = 0;
+ ssize_t ret;
+
+ if (!dev->process_mad)
+ return -ENOSYS;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ in_mad->mad_hdr.base_version = 1;
+ in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
+ in_mad->mad_hdr.class_version = 1;
+ in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ in_mad->mad_hdr.attr_id = attr;
+
+ if (attr != IB_PMA_CLASS_PORT_INFO)
+ in_mad->data[41] = port_num; /* PortSelect field */
+
+ if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY,
+ port_num, NULL, NULL,
+ (const struct ib_mad_hdr *)in_mad, mad_size,
+ (struct ib_mad_hdr *)out_mad, &mad_size,
+ &out_mad_pkey_index) &
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ memcpy(data, out_mad->data + offset, size);
+ ret = size;
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return ret;
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int offset = tab_attr->index & 0xffff;
+ int width = (tab_attr->index >> 16) & 0xff;
+ ssize_t ret;
+ u8 data[8];
+
+ ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data,
+ 40 + offset / 8, sizeof(data));
+ if (ret < 0)
+ return ret;
+
+ switch (width) {
+ case 4:
+ ret = sprintf(buf, "%u\n", (*data >>
+ (4 - (offset % 8))) & 0xf);
+ break;
+ case 8:
+ ret = sprintf(buf, "%u\n", *data);
+ break;
+ case 16:
+ ret = sprintf(buf, "%u\n",
+ be16_to_cpup((__be16 *)data));
+ break;
+ case 32:
+ ret = sprintf(buf, "%u\n",
+ be32_to_cpup((__be32 *)data));
+ break;
+ case 64:
+ ret = sprintf(buf, "%llu\n",
+ be64_to_cpup((__be64 *)data));
+ break;
+
+ default:
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static PORT_PMA_ATTR(symbol_error , 0, 16, 32);
+static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
+static PORT_PMA_ATTR(link_downed , 2, 8, 56);
+static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
+static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156);
+static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320);
+
+/*
+ * Counters added by extended set
+ */
+static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64);
+static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128);
+static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192);
+static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256);
+static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320);
+static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384);
+static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448);
+static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512);
+
+static struct attribute *pma_attrs[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_port_xmit_data.attr.attr,
+ &port_pma_attr_port_rcv_data.attr.attr,
+ &port_pma_attr_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ NULL
+};
+
+static struct attribute *pma_attrs_ext[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_xmit_packets.attr.attr,
+ &port_pma_attr_ext_multicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_multicast_xmit_packets.attr.attr,
+ NULL
+};
+
+static struct attribute *pma_attrs_noietf[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ NULL
+};
+
+static struct attribute_group pma_group = {
+ .name = "counters",
+ .attrs = pma_attrs
+};
+
+static struct attribute_group pma_group_ext = {
+ .name = "counters",
+ .attrs = pma_attrs_ext
+};
+
+static struct attribute_group pma_group_noietf = {
+ .name = "counters",
+ .attrs = pma_attrs_noietf
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ struct attribute *a;
+ int i;
+
+ if (p->gid_group.attrs) {
+ for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->gid_group.attrs);
+ }
+
+ if (p->pkey_group.attrs) {
+ for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->pkey_group.attrs);
+ }
+
+ kfree(p);
+}
+
+static void ib_port_gid_attr_release(struct kobject *kobj)
+{
+ struct gid_attr_group *g = container_of(kobj, struct gid_attr_group,
+ kobj);
+ struct attribute *a;
+ int i;
+
+ if (g->ndev.attrs) {
+ for (i = 0; (a = g->ndev.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(g->ndev.attrs);
+ }
+
+ if (g->type.attrs) {
+ for (i = 0; (a = g->type.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(g->type.attrs);
+ }
+
+ kfree(g);
+}
+
+static struct kobj_type port_type = {
+ .release = ib_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+ .default_attrs = port_default_attrs
+};
+
+static struct kobj_type gid_attr_type = {
+ .sysfs_ops = &gid_attr_sysfs_ops,
+ .release = ib_port_gid_attr_release
+};
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+ struct port_attribute *, char *buf),
+ int len)
+{
+ struct attribute **tab_attr;
+ struct port_table_attribute *element;
+ int i;
+
+ tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
+ if (!tab_attr)
+ return NULL;
+
+ for (i = 0; i < len; i++) {
+ element = kzalloc(sizeof(struct port_table_attribute),
+ GFP_KERNEL);
+ if (!element)
+ goto err;
+
+ if (snprintf(element->name, sizeof(element->name),
+ "%d", i) >= sizeof(element->name)) {
+ kfree(element);
+ goto err;
+ }
+
+ element->attr.attr.name = element->name;
+ element->attr.attr.mode = S_IRUGO;
+ element->attr.show = show;
+ element->index = i;
+ sysfs_attr_init(&element->attr.attr);
+
+ tab_attr[i] = &element->attr.attr;
+ }
+
+ return tab_attr;
+
+err:
+ while (--i >= 0)
+ kfree(tab_attr[i]);
+ kfree(tab_attr);
+ return NULL;
+}
+
+/*
+ * Figure out which counter table to use depending on
+ * the device capabilities.
+ */
+static struct attribute_group *get_counter_table(struct ib_device *dev,
+ int port_num)
+{
+ struct ib_class_port_info cpi;
+
+ if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO,
+ &cpi, 40, sizeof(cpi)) >= 0) {
+ if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH)
+ /* We have extended counters */
+ return &pma_group_ext;
+
+ if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF)
+ /* But not the IETF ones */
+ return &pma_group_noietf;
+ }
+
+ /* Fall back to normal counters */
+ return &pma_group;
+}
+
+static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
+ u8 port_num, int index)
+{
+ int ret;
+
+ if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
+ return 0;
+ ret = dev->get_hw_stats(dev, stats, port_num, index);
+ if (ret < 0)
+ return ret;
+ if (ret == stats->num_counters)
+ stats->timestamp = jiffies;
+
+ return 0;
+}
+
+static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+{
+ return sprintf(buf, "%llu\n", stats->value[index]);
+}
+
+static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct ib_device *dev;
+ struct ib_port *port;
+ struct hw_stats_attribute *hsa;
+ struct rdma_hw_stats *stats;
+ int ret;
+
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+ stats = dev->hw_stats;
+ } else {
+ port = container_of(kobj, struct ib_port, kobj);
+ dev = port->ibdev;
+ stats = port->hw_stats;
+ }
+ mutex_lock(&stats->lock);
+ ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
+ if (ret)
+ goto unlock;
+ ret = print_hw_stat(stats, hsa->index, buf);
+unlock:
+ mutex_unlock(&stats->lock);
+
+ return ret;
+}
+
+static ssize_t show_stats_lifespan(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct hw_stats_attribute *hsa;
+ struct rdma_hw_stats *stats;
+ int msecs;
+
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ struct ib_device *dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+
+ stats = dev->hw_stats;
+ } else {
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ stats = p->hw_stats;
+ }
+
+ mutex_lock(&stats->lock);
+ msecs = jiffies_to_msecs(stats->lifespan);
+ mutex_unlock(&stats->lock);
+
+ return sprintf(buf, "%d\n", msecs);
+}
+
+static ssize_t set_stats_lifespan(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hw_stats_attribute *hsa;
+ struct rdma_hw_stats *stats;
+ int msecs;
+ int jiffies;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &msecs);
+ if (ret)
+ return ret;
+ if (msecs < 0 || msecs > 10000)
+ return -EINVAL;
+ jiffies = msecs_to_jiffies(msecs);
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ struct ib_device *dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+
+ stats = dev->hw_stats;
+ } else {
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ stats = p->hw_stats;
+ }
+
+ mutex_lock(&stats->lock);
+ stats->lifespan = jiffies;
+ mutex_unlock(&stats->lock);
+
+ return count;
+}
+
+static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group)
+{
+ struct attribute **attr;
+
+ sysfs_remove_group(kobj, attr_group);
+
+ for (attr = attr_group->attrs; *attr; attr++)
+ kfree(*attr);
+ kfree(attr_group);
+}
+
+static struct attribute *alloc_hsa(int index, u8 port_num, const char *name)
+{
+ struct hw_stats_attribute *hsa;
+
+ hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+ if (!hsa)
+ return NULL;
+
+ hsa->attr.name = (char *)name;
+ hsa->attr.mode = S_IRUGO;
+ hsa->show = show_hw_stats;
+ hsa->store = NULL;
+ hsa->index = index;
+ hsa->port_num = port_num;
+
+ return &hsa->attr;
+}
+
+static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num)
+{
+ struct hw_stats_attribute *hsa;
+
+ hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+ if (!hsa)
+ return NULL;
+
+ hsa->attr.name = name;
+ hsa->attr.mode = S_IWUSR | S_IRUGO;
+ hsa->show = show_stats_lifespan;
+ hsa->store = set_stats_lifespan;
+ hsa->index = 0;
+ hsa->port_num = port_num;
+
+ return &hsa->attr;
+}
+
+static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
+ u8 port_num)
+{
+ struct attribute_group *hsag;
+ struct rdma_hw_stats *stats;
+ int i, ret;
+
+ stats = device->alloc_hw_stats(device, port_num);
+
+ if (!stats)
+ return;
+
+ if (!stats->names || stats->num_counters <= 0)
+ goto err_free_stats;
+
+ /*
+ * Two extra attribue elements here, one for the lifespan entry and
+ * one to NULL terminate the list for the sysfs core code
+ */
+ hsag = kzalloc(sizeof(*hsag) +
+ sizeof(void *) * (stats->num_counters + 2),
+ GFP_KERNEL);
+ if (!hsag)
+ goto err_free_stats;
+
+ ret = device->get_hw_stats(device, stats, port_num,
+ stats->num_counters);
+ if (ret != stats->num_counters)
+ goto err_free_hsag;
+
+ stats->timestamp = jiffies;
+
+ hsag->name = "hw_counters";
+ hsag->attrs = (void *)hsag + sizeof(*hsag);
+
+ for (i = 0; i < stats->num_counters; i++) {
+ hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]);
+ if (!hsag->attrs[i])
+ goto err;
+ sysfs_attr_init(hsag->attrs[i]);
+ }
+
+ mutex_init(&stats->lock);
+ /* treat an error here as non-fatal */
+ hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num);
+ if (hsag->attrs[i])
+ sysfs_attr_init(hsag->attrs[i]);
+
+ if (port) {
+ struct kobject *kobj = &port->kobj;
+ ret = sysfs_create_group(kobj, hsag);
+ if (ret)
+ goto err;
+ port->hw_stats_ag = hsag;
+ port->hw_stats = stats;
+ } else {
+ struct kobject *kobj = &device->dev.kobj;
+ ret = sysfs_create_group(kobj, hsag);
+ if (ret)
+ goto err;
+ device->hw_stats_ag = hsag;
+ device->hw_stats = stats;
+ }
+
+ return;
+
+err:
+ for (; i >= 0; i--)
+ kfree(hsag->attrs[i]);
+err_free_hsag:
+ kfree(hsag);
+err_free_stats:
+ kfree(stats);
+ return;
+}
+
+static int add_port(struct ib_device *device, int port_num,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ struct ib_port *p;
+ struct ib_port_attr attr;
+ int i;
+ int ret;
+
+ ret = ib_query_port(device, port_num, &attr);
+ if (ret)
+ return ret;
+
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->ibdev = device;
+ p->port_num = port_num;
+
+ ret = kobject_init_and_add(&p->kobj, &port_type,
+ device->ports_parent,
+ "%d", port_num);
+ if (ret) {
+ kfree(p);
+ return ret;
+ }
+
+ p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL);
+ if (!p->gid_attr_group) {
+ ret = -ENOMEM;
+ goto err_put;
+ }
+
+ p->gid_attr_group->port = p;
+ ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type,
+ &p->kobj, "gid_attrs");
+ if (ret) {
+ kfree(p->gid_attr_group);
+ goto err_put;
+ }
+
+ if (device->process_mad) {
+ p->pma_table = get_counter_table(device, port_num);
+ ret = sysfs_create_group(&p->kobj, p->pma_table);
+ if (ret)
+ goto err_put_gid_attrs;
+ }
+
+ p->gid_group.name = "gids";
+ p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+ if (!p->gid_group.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_pma;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ p->gid_attr_group->ndev.name = "ndevs";
+ p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev,
+ attr.gid_tbl_len);
+ if (!p->gid_attr_group->ndev.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid;
+ }
+
+ ret = sysfs_create_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->ndev);
+ if (ret)
+ goto err_free_gid_ndev;
+
+ p->gid_attr_group->type.name = "types";
+ p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type,
+ attr.gid_tbl_len);
+ if (!p->gid_attr_group->type.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid_ndev;
+ }
+
+ ret = sysfs_create_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->type);
+ if (ret)
+ goto err_free_gid_type;
+
+ p->pkey_group.name = "pkeys";
+ p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+ attr.pkey_tbl_len);
+ if (!p->pkey_group.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid_type;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ if (port_callback) {
+ ret = port_callback(device, port_num, &p->kobj);
+ if (ret)
+ goto err_remove_pkey;
+ }
+
+ /*
+ * If port == 0, it means we have only one port and the parent
+ * device, not this port device, should be the holder of the
+ * hw_counters
+ */
+ if (device->alloc_hw_stats && port_num)
+ setup_hw_stats(device, p, port_num);
+
+ list_add_tail(&p->kobj.entry, &device->port_list);
+
+ kobject_uevent(&p->kobj, KOBJ_ADD);
+ return 0;
+
+err_remove_pkey:
+ sysfs_remove_group(&p->kobj, &p->pkey_group);
+
+err_free_pkey:
+ for (i = 0; i < attr.pkey_tbl_len; ++i)
+ kfree(p->pkey_group.attrs[i]);
+
+ kfree(p->pkey_group.attrs);
+ p->pkey_group.attrs = NULL;
+
+err_remove_gid_type:
+ sysfs_remove_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->type);
+
+err_free_gid_type:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_attr_group->type.attrs[i]);
+
+ kfree(p->gid_attr_group->type.attrs);
+ p->gid_attr_group->type.attrs = NULL;
+
+err_remove_gid_ndev:
+ sysfs_remove_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->ndev);
+
+err_free_gid_ndev:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_attr_group->ndev.attrs[i]);
+
+ kfree(p->gid_attr_group->ndev.attrs);
+ p->gid_attr_group->ndev.attrs = NULL;
+
+err_remove_gid:
+ sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_group.attrs[i]);
+
+ kfree(p->gid_group.attrs);
+ p->gid_group.attrs = NULL;
+
+err_remove_pma:
+ if (p->pma_table)
+ sysfs_remove_group(&p->kobj, p->pma_table);
+
+err_put_gid_attrs:
+ kobject_put(&p->gid_attr_group->kobj);
+
+err_put:
+ kobject_put(&p->kobj);
+ return ret;
+}
+
+static ssize_t show_node_type(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ switch (dev->node_type) {
+ case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
+ case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
+ case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type);
+ case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+ case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+ case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+ default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+ }
+}
+
+static ssize_t show_sys_image_guid(struct device *device,
+ struct device_attribute *dev_attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
+}
+
+static ssize_t show_node_desc(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%.64s\n", dev->node_desc);
+}
+
+static ssize_t set_node_desc(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device_modify desc = {};
+ int ret;
+
+ if (!dev->modify_device)
+ return -EIO;
+
+ memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX));
+ ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ ib_get_device_fw_str(dev, buf);
+ strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX);
+ return strlen(buf);
+}
+
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+
+static struct device_attribute *ib_class_attributes[] = {
+ &dev_attr_node_type,
+ &dev_attr_sys_image_guid,
+ &dev_attr_node_guid,
+ &dev_attr_node_desc,
+ &dev_attr_fw_ver,
+};
+
+static void free_port_list_attributes(struct ib_device *device)
+{
+ struct kobject *p, *t;
+
+ list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ struct ib_port *port = container_of(p, struct ib_port, kobj);
+ list_del(&p->entry);
+ if (port->hw_stats) {
+ kfree(port->hw_stats);
+ free_hsag(&port->kobj, port->hw_stats_ag);
+ }
+
+ if (port->pma_table)
+ sysfs_remove_group(p, port->pma_table);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ sysfs_remove_group(&port->gid_attr_group->kobj,
+ &port->gid_attr_group->ndev);
+ sysfs_remove_group(&port->gid_attr_group->kobj,
+ &port->gid_attr_group->type);
+ kobject_put(&port->gid_attr_group->kobj);
+ kobject_put(p);
+ }
+
+ kobject_put(device->ports_parent);
+}
+
+int ib_device_register_sysfs(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ struct device *class_dev = &device->dev;
+ int ret;
+ int i;
+
+ ret = dev_set_name(class_dev, "%s", device->name);
+ if (ret)
+ return ret;
+
+ ret = device_add(class_dev);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+ ret = device_create_file(class_dev, ib_class_attributes[i]);
+ if (ret)
+ goto err_unregister;
+ }
+
+ device->ports_parent = kobject_create_and_add("ports",
+ &class_dev->kobj);
+ if (!device->ports_parent) {
+ ret = -ENOMEM;
+ goto err_put;
+ }
+
+ if (rdma_cap_ib_switch(device)) {
+ ret = add_port(device, 0, port_callback);
+ if (ret)
+ goto err_put;
+ } else {
+ for (i = 1; i <= device->phys_port_cnt; ++i) {
+ ret = add_port(device, i, port_callback);
+ if (ret)
+ goto err_put;
+ }
+ }
+
+ if (device->alloc_hw_stats)
+ setup_hw_stats(device, NULL, 0);
+
+ return 0;
+
+err_put:
+ free_port_list_attributes(device);
+
+err_unregister:
+ device_del(class_dev);
+
+err:
+ return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+ int i;
+
+ /* Hold kobject until ib_dealloc_device() */
+ kobject_get(&device->dev.kobj);
+
+ free_port_list_attributes(device);
+
+ if (device->hw_stats) {
+ kfree(device->hw_stats);
+ free_hsag(&device->dev.kobj, device->hw_stats_ag);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
+ device_remove_file(&device->dev, ib_class_attributes[i]);
+
+ device_unregister(&device->dev);
+}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
new file mode 100644
index 000000000..73332b9a2
--- /dev/null
+++ b/drivers/infiniband/core/ucm.c
@@ -0,0 +1,1359 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <linux/nospec.h>
+
+#include <linux/uaccess.h>
+
+#include <rdma/ib.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_user_cm.h>
+#include <rdma/ib_marshall.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Libor Michalek");
+MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_ucm_device {
+ int devnum;
+ struct cdev cdev;
+ struct device dev;
+ struct ib_device *ib_dev;
+};
+
+struct ib_ucm_file {
+ struct mutex file_mutex;
+ struct file *filp;
+ struct ib_ucm_device *device;
+
+ struct list_head ctxs;
+ struct list_head events;
+ wait_queue_head_t poll_wait;
+};
+
+struct ib_ucm_context {
+ int id;
+ struct completion comp;
+ atomic_t ref;
+ int events_reported;
+
+ struct ib_ucm_file *file;
+ struct ib_cm_id *cm_id;
+ __u64 uid;
+
+ struct list_head events; /* list of pending events. */
+ struct list_head file_list; /* member in file ctx list */
+};
+
+struct ib_ucm_event {
+ struct ib_ucm_context *ctx;
+ struct list_head file_list; /* member in file event list */
+ struct list_head ctx_list; /* member in ctx event list */
+
+ struct ib_cm_id *cm_id;
+ struct ib_ucm_event_resp resp;
+ void *data;
+ void *info;
+ int data_len;
+ int info_len;
+};
+
+enum {
+ IB_UCM_MAJOR = 231,
+ IB_UCM_BASE_MINOR = 224,
+ IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS,
+ IB_UCM_NUM_FIXED_MINOR = 32,
+ IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR,
+};
+
+#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
+static dev_t dynamic_ucm_dev;
+
+static void ib_ucm_add_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client ucm_client = {
+ .name = "ucm",
+ .add = ib_ucm_add_one,
+ .remove = ib_ucm_remove_one
+};
+
+static DEFINE_MUTEX(ctx_id_mutex);
+static DEFINE_IDR(ctx_id_table);
+static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
+
+static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
+{
+ struct ib_ucm_context *ctx;
+
+ mutex_lock(&ctx_id_mutex);
+ ctx = idr_find(&ctx_id_table, id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ else
+ atomic_inc(&ctx->ref);
+ mutex_unlock(&ctx_id_mutex);
+
+ return ctx;
+}
+
+static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->ref))
+ complete(&ctx->comp);
+}
+
+static inline int ib_ucm_new_cm_id(int event)
+{
+ return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
+}
+
+static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
+{
+ struct ib_ucm_event *uevent;
+
+ mutex_lock(&ctx->file->file_mutex);
+ list_del(&ctx->file_list);
+ while (!list_empty(&ctx->events)) {
+
+ uevent = list_entry(ctx->events.next,
+ struct ib_ucm_event, ctx_list);
+ list_del(&uevent->file_list);
+ list_del(&uevent->ctx_list);
+ mutex_unlock(&ctx->file->file_mutex);
+
+ /* clear incoming connections. */
+ if (ib_ucm_new_cm_id(uevent->resp.event))
+ ib_destroy_cm_id(uevent->cm_id);
+
+ kfree(uevent);
+ mutex_lock(&ctx->file->file_mutex);
+ }
+ mutex_unlock(&ctx->file->file_mutex);
+}
+
+static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
+{
+ struct ib_ucm_context *ctx;
+
+ ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ atomic_set(&ctx->ref, 1);
+ init_completion(&ctx->comp);
+ ctx->file = file;
+ INIT_LIST_HEAD(&ctx->events);
+
+ mutex_lock(&ctx_id_mutex);
+ ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
+ mutex_unlock(&ctx_id_mutex);
+ if (ctx->id < 0)
+ goto error;
+
+ list_add_tail(&ctx->file_list, &file->ctxs);
+ return ctx;
+
+error:
+ kfree(ctx);
+ return NULL;
+}
+
+static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
+ const struct ib_cm_req_event_param *kreq)
+{
+ ureq->remote_ca_guid = kreq->remote_ca_guid;
+ ureq->remote_qkey = kreq->remote_qkey;
+ ureq->remote_qpn = kreq->remote_qpn;
+ ureq->qp_type = kreq->qp_type;
+ ureq->starting_psn = kreq->starting_psn;
+ ureq->responder_resources = kreq->responder_resources;
+ ureq->initiator_depth = kreq->initiator_depth;
+ ureq->local_cm_response_timeout = kreq->local_cm_response_timeout;
+ ureq->flow_control = kreq->flow_control;
+ ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
+ ureq->retry_count = kreq->retry_count;
+ ureq->rnr_retry_count = kreq->rnr_retry_count;
+ ureq->srq = kreq->srq;
+ ureq->port = kreq->port;
+
+ ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
+ if (kreq->alternate_path)
+ ib_copy_path_rec_to_user(&ureq->alternate_path,
+ kreq->alternate_path);
+}
+
+static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
+ const struct ib_cm_rep_event_param *krep)
+{
+ urep->remote_ca_guid = krep->remote_ca_guid;
+ urep->remote_qkey = krep->remote_qkey;
+ urep->remote_qpn = krep->remote_qpn;
+ urep->starting_psn = krep->starting_psn;
+ urep->responder_resources = krep->responder_resources;
+ urep->initiator_depth = krep->initiator_depth;
+ urep->target_ack_delay = krep->target_ack_delay;
+ urep->failover_accepted = krep->failover_accepted;
+ urep->flow_control = krep->flow_control;
+ urep->rnr_retry_count = krep->rnr_retry_count;
+ urep->srq = krep->srq;
+}
+
+static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
+ const struct ib_cm_sidr_rep_event_param *krep)
+{
+ urep->status = krep->status;
+ urep->qkey = krep->qkey;
+ urep->qpn = krep->qpn;
+};
+
+static int ib_ucm_event_process(const struct ib_cm_event *evt,
+ struct ib_ucm_event *uvt)
+{
+ void *info = NULL;
+
+ switch (evt->event) {
+ case IB_CM_REQ_RECEIVED:
+ ib_ucm_event_req_get(&uvt->resp.u.req_resp,
+ &evt->param.req_rcvd);
+ uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE;
+ uvt->resp.present = IB_UCM_PRES_PRIMARY;
+ uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
+ IB_UCM_PRES_ALTERNATE : 0);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
+ &evt->param.rep_rcvd);
+ uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_RTU_RECEIVED:
+ uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_DREP_RECEIVED:
+ uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ uvt->resp.u.mra_resp.timeout =
+ evt->param.mra_rcvd.service_timeout;
+ uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_REJ_RECEIVED:
+ uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
+ uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.rej_rcvd.ari_length;
+ info = evt->param.rej_rcvd.ari;
+ break;
+ case IB_CM_LAP_RECEIVED:
+ ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
+ evt->param.lap_rcvd.alternate_path);
+ uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
+ uvt->resp.present = IB_UCM_PRES_ALTERNATE;
+ break;
+ case IB_CM_APR_RECEIVED:
+ uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
+ uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.apr_rcvd.info_len;
+ info = evt->param.apr_rcvd.apr_info;
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ uvt->resp.u.sidr_req_resp.pkey =
+ evt->param.sidr_req_rcvd.pkey;
+ uvt->resp.u.sidr_req_resp.port =
+ evt->param.sidr_req_rcvd.port;
+ uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
+ &evt->param.sidr_rep_rcvd);
+ uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
+ info = evt->param.sidr_rep_rcvd.info;
+ break;
+ default:
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ }
+
+ if (uvt->data_len) {
+ uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
+ if (!uvt->data)
+ goto err1;
+
+ uvt->resp.present |= IB_UCM_PRES_DATA;
+ }
+
+ if (uvt->info_len) {
+ uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
+ if (!uvt->info)
+ goto err2;
+
+ uvt->resp.present |= IB_UCM_PRES_INFO;
+ }
+ return 0;
+
+err2:
+ kfree(uvt->data);
+err1:
+ return -ENOMEM;
+}
+
+static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *event)
+{
+ struct ib_ucm_event *uevent;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ ctx = cm_id->context;
+
+ uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
+ if (!uevent)
+ goto err1;
+
+ uevent->ctx = ctx;
+ uevent->cm_id = cm_id;
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ uevent->resp.event = event->event;
+
+ result = ib_ucm_event_process(event, uevent);
+ if (result)
+ goto err2;
+
+ mutex_lock(&ctx->file->file_mutex);
+ list_add_tail(&uevent->file_list, &ctx->file->events);
+ list_add_tail(&uevent->ctx_list, &ctx->events);
+ wake_up_interruptible(&ctx->file->poll_wait);
+ mutex_unlock(&ctx->file->file_mutex);
+ return 0;
+
+err2:
+ kfree(uevent);
+err1:
+ /* Destroy new cm_id's */
+ return ib_ucm_new_cm_id(event->event);
+}
+
+static ssize_t ib_ucm_event(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_event_get cmd;
+ struct ib_ucm_event *uevent;
+ int result = 0;
+
+ if (out_len < sizeof(struct ib_ucm_event_resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->file_mutex);
+ while (list_empty(&file->events)) {
+ mutex_unlock(&file->file_mutex);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->events)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->file_mutex);
+ }
+
+ uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
+
+ if (ib_ucm_new_cm_id(uevent->resp.event)) {
+ ctx = ib_ucm_ctx_alloc(file);
+ if (!ctx) {
+ result = -ENOMEM;
+ goto done;
+ }
+
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id;
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &uevent->resp, sizeof(uevent->resp))) {
+ result = -EFAULT;
+ goto done;
+ }
+
+ if (uevent->data) {
+ if (cmd.data_len < uevent->data_len) {
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user(u64_to_user_ptr(cmd.data),
+ uevent->data, uevent->data_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ if (uevent->info) {
+ if (cmd.info_len < uevent->info_len) {
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user(u64_to_user_ptr(cmd.info),
+ uevent->info, uevent->info_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ list_del(&uevent->file_list);
+ list_del(&uevent->ctx_list);
+ uevent->ctx->events_reported++;
+
+ kfree(uevent->data);
+ kfree(uevent->info);
+ kfree(uevent);
+done:
+ mutex_unlock(&file->file_mutex);
+ return result;
+}
+
+static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_create_id cmd;
+ struct ib_ucm_create_id_resp resp;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->file_mutex);
+ ctx = ib_ucm_ctx_alloc(file);
+ mutex_unlock(&file->file_mutex);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->uid = cmd.uid;
+ ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
+ ib_ucm_event_handler, ctx);
+ if (IS_ERR(ctx->cm_id)) {
+ result = PTR_ERR(ctx->cm_id);
+ goto err1;
+ }
+
+ resp.id = ctx->id;
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp))) {
+ result = -EFAULT;
+ goto err2;
+ }
+ return 0;
+
+err2:
+ ib_destroy_cm_id(ctx->cm_id);
+err1:
+ mutex_lock(&ctx_id_mutex);
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+ kfree(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_destroy_id cmd;
+ struct ib_ucm_destroy_id_resp resp;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&ctx_id_mutex);
+ ctx = idr_find(&ctx_id_table, cmd.id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ else
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ib_ucm_ctx_put(ctx);
+ wait_for_completion(&ctx->comp);
+
+ /* No new events will be generated after destroying the cm_id. */
+ ib_destroy_cm_id(ctx->cm_id);
+ /* Cleanup events not yet reported to the user. */
+ ib_ucm_cleanup_events(ctx);
+
+ resp.events_reported = ctx->events_reported;
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+ kfree(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_attr_id_resp resp;
+ struct ib_ucm_attr_id cmd;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.service_id = ctx->cm_id->service_id;
+ resp.service_mask = ctx->cm_id->service_mask;
+ resp.local_id = ctx->cm_id->local_id;
+ resp.remote_id = ctx->cm_id->remote_id;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_qp_attr resp;
+ struct ib_ucm_init_qp_attr cmd;
+ struct ib_ucm_context *ctx;
+ struct ib_qp_attr qp_attr;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.qp_attr_mask = 0;
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.qp_state = cmd.qp_state;
+ result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+ if (result)
+ goto out;
+
+ ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+out:
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
+{
+ service_id &= service_mask;
+
+ if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
+ ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_listen cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
+ if (result)
+ goto out;
+
+ result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
+out:
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_notify cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
+{
+ void *data;
+
+ *dest = NULL;
+
+ if (!len)
+ return 0;
+
+ data = memdup_user(u64_to_user_ptr(src), len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ *dest = data;
+ return 0;
+}
+
+static int ib_ucm_path_get(struct sa_path_rec **path, u64 src)
+{
+ struct ib_user_path_rec upath;
+ struct sa_path_rec *sa_path;
+
+ *path = NULL;
+
+ if (!src)
+ return 0;
+
+ sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
+ if (!sa_path)
+ return -ENOMEM;
+
+ if (copy_from_user(&upath, u64_to_user_ptr(src),
+ sizeof(upath))) {
+
+ kfree(sa_path);
+ return -EFAULT;
+ }
+
+ ib_copy_path_rec_from_user(sa_path, &upath);
+ *path = sa_path;
+ return 0;
+}
+
+static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_req_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_req cmd;
+ int result;
+
+ param.private_data = NULL;
+ param.primary_path = NULL;
+ param.alternate_path = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
+ if (result)
+ goto done;
+
+ param.private_data_len = cmd.len;
+ param.service_id = cmd.sid;
+ param.qp_num = cmd.qpn;
+ param.qp_type = cmd.qp_type;
+ param.starting_psn = cmd.psn;
+ param.peer_to_peer = cmd.peer_to_peer;
+ param.responder_resources = cmd.responder_resources;
+ param.initiator_depth = cmd.initiator_depth;
+ param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
+ param.flow_control = cmd.flow_control;
+ param.local_cm_response_timeout = cmd.local_cm_response_timeout;
+ param.retry_count = cmd.retry_count;
+ param.rnr_retry_count = cmd.rnr_retry_count;
+ param.max_cm_retries = cmd.max_cm_retries;
+ param.srq = cmd.srq;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_req(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.primary_path);
+ kfree(param.alternate_path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_rep_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_rep cmd;
+ int result;
+
+ param.private_data = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ param.qp_num = cmd.qpn;
+ param.starting_psn = cmd.psn;
+ param.private_data_len = cmd.len;
+ param.responder_resources = cmd.responder_resources;
+ param.initiator_depth = cmd.initiator_depth;
+ param.failover_accepted = cmd.failover_accepted;
+ param.flow_control = cmd.flow_control;
+ param.rnr_retry_count = cmd.rnr_retry_count;
+ param.srq = cmd.srq;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ ctx->uid = cmd.uid;
+ result = ib_send_cm_rep(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(param.private_data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
+ const char __user *inbuf, int in_len,
+ int (*func)(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len))
+{
+ struct ib_ucm_private_data cmd;
+ struct ib_ucm_context *ctx;
+ const void *private_data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = func(ctx->cm_id, private_data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(private_data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
+}
+
+static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
+}
+
+static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
+}
+
+static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
+ const char __user *inbuf, int in_len,
+ int (*func)(struct ib_cm_id *cm_id,
+ int status,
+ const void *info,
+ u8 info_len,
+ const void *data,
+ u8 data_len))
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_info cmd;
+ const void *data = NULL;
+ const void *info = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
+ if (result)
+ goto done;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
+ data, cmd.data_len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(data);
+ kfree(info);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
+}
+
+static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
+}
+
+static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_mra cmd;
+ const void *data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct sa_path_rec *path = NULL;
+ struct ib_ucm_lap cmd;
+ const void *data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&path, cmd.path);
+ if (result)
+ goto done;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(data);
+ kfree(path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_sidr_req_param param = {};
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_sidr_req cmd;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.path, cmd.path);
+ if (result)
+ goto done;
+
+ param.private_data_len = cmd.len;
+ param.service_id = cmd.sid;
+ param.timeout_ms = cmd.timeout;
+ param.max_cm_retries = cmd.max_cm_retries;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_sidr_req(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_sidr_rep_param param;
+ struct ib_ucm_sidr_rep cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ param.info = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data,
+ cmd.data, cmd.data_len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
+ if (result)
+ goto done;
+
+ param.qp_num = cmd.qpn;
+ param.qkey = cmd.qkey;
+ param.status = cmd.status;
+ param.info_length = cmd.info_len;
+ param.private_data_len = cmd.data_len;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.info);
+ return result;
+}
+
+static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id,
+ [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id,
+ [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id,
+ [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen,
+ [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify,
+ [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req,
+ [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep,
+ [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu,
+ [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq,
+ [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep,
+ [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej,
+ [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra,
+ [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap,
+ [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr,
+ [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
+ [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
+ [IB_USER_CM_CMD_EVENT] = ib_ucm_event,
+ [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr,
+};
+
+static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ struct ib_ucm_cmd_hdr hdr;
+ ssize_t result;
+
+ if (!ib_safe_file_access(filp)) {
+ pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
+ task_tgid_vnr(current), current->comm);
+ return -EACCES;
+ }
+
+ if (len < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+ return -EINVAL;
+ hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table));
+
+ if (hdr.in + sizeof(hdr) > len)
+ return -EINVAL;
+
+ result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
+ hdr.in, hdr.out);
+ if (!result)
+ result = len;
+
+ return result;
+}
+
+static __poll_t ib_ucm_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ if (!list_empty(&file->events))
+ mask = EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+/*
+ * ib_ucm_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+static int ib_ucm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_ucm_file *file;
+
+ file = kmalloc(sizeof(*file), GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&file->events);
+ INIT_LIST_HEAD(&file->ctxs);
+ init_waitqueue_head(&file->poll_wait);
+
+ mutex_init(&file->file_mutex);
+
+ filp->private_data = file;
+ file->filp = filp;
+ file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
+
+ return nonseekable_open(inode, filp);
+}
+
+static int ib_ucm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ struct ib_ucm_context *ctx;
+
+ mutex_lock(&file->file_mutex);
+ while (!list_empty(&file->ctxs)) {
+ ctx = list_entry(file->ctxs.next,
+ struct ib_ucm_context, file_list);
+ mutex_unlock(&file->file_mutex);
+
+ mutex_lock(&ctx_id_mutex);
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+
+ ib_destroy_cm_id(ctx->cm_id);
+ ib_ucm_cleanup_events(ctx);
+ kfree(ctx);
+
+ mutex_lock(&file->file_mutex);
+ }
+ mutex_unlock(&file->file_mutex);
+ kfree(file);
+ return 0;
+}
+
+static void ib_ucm_release_dev(struct device *dev)
+{
+ struct ib_ucm_device *ucm_dev;
+
+ ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+ kfree(ucm_dev);
+}
+
+static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev)
+{
+ clear_bit(ucm_dev->devnum, dev_map);
+}
+
+static const struct file_operations ucm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_ucm_open,
+ .release = ib_ucm_close,
+ .write = ib_ucm_write,
+ .poll = ib_ucm_poll,
+ .llseek = no_llseek,
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_ucm_device *ucm_dev;
+
+ ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+ return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static void ib_ucm_add_one(struct ib_device *device)
+{
+ int devnum;
+ dev_t base;
+ struct ib_ucm_device *ucm_dev;
+
+ if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1))
+ return;
+
+ ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
+ if (!ucm_dev)
+ return;
+
+ device_initialize(&ucm_dev->dev);
+ ucm_dev->ib_dev = device;
+ ucm_dev->dev.release = ib_ucm_release_dev;
+
+ devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+ if (devnum >= IB_UCM_MAX_DEVICES)
+ goto err;
+ ucm_dev->devnum = devnum;
+ set_bit(devnum, dev_map);
+ if (devnum >= IB_UCM_NUM_FIXED_MINOR)
+ base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR;
+ else
+ base = IB_UCM_BASE_DEV + devnum;
+
+ cdev_init(&ucm_dev->cdev, &ucm_fops);
+ ucm_dev->cdev.owner = THIS_MODULE;
+ kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
+
+ ucm_dev->dev.class = &cm_class;
+ ucm_dev->dev.parent = device->dev.parent;
+ ucm_dev->dev.devt = base;
+
+ dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
+ if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev))
+ goto err_devnum;
+
+ if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
+ goto err_dev;
+
+ ib_set_client_data(device, &ucm_client, ucm_dev);
+ return;
+
+err_dev:
+ cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
+err_devnum:
+ ib_ucm_free_dev(ucm_dev);
+err:
+ put_device(&ucm_dev->dev);
+ return;
+}
+
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_ucm_device *ucm_dev = client_data;
+
+ if (!ucm_dev)
+ return;
+
+ cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
+ ib_ucm_free_dev(ucm_dev);
+ put_device(&ucm_dev->dev);
+}
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_CM_ABI_VERSION));
+
+static int __init ib_ucm_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR,
+ "infiniband_cm");
+ if (ret) {
+ pr_err("ucm: couldn't register device number\n");
+ goto error1;
+ }
+
+ ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR,
+ "infiniband_cm");
+ if (ret) {
+ pr_err("ucm: couldn't register dynamic device number\n");
+ goto err_alloc;
+ }
+
+ ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
+ if (ret) {
+ pr_err("ucm: couldn't create abi_version attribute\n");
+ goto error2;
+ }
+
+ ret = ib_register_client(&ucm_client);
+ if (ret) {
+ pr_err("ucm: couldn't register client\n");
+ goto error3;
+ }
+ return 0;
+
+error3:
+ class_remove_file(&cm_class, &class_attr_abi_version.attr);
+error2:
+ unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
+err_alloc:
+ unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
+error1:
+ return ret;
+}
+
+static void __exit ib_ucm_cleanup(void)
+{
+ ib_unregister_client(&ucm_client);
+ class_remove_file(&cm_class, &class_attr_abi_version.attr);
+ unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
+ unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
+ idr_destroy(&ctx_id_table);
+}
+
+module_init(ib_ucm_init);
+module_exit(ib_ucm_cleanup);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
new file mode 100644
index 000000000..01052de6b
--- /dev/null
+++ b/drivers/infiniband/core/ucma.c
@@ -0,0 +1,1896 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+
+#include <linux/nospec.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static unsigned int max_backlog = 1024;
+
+static struct ctl_table_header *ucma_ctl_table_hdr;
+static struct ctl_table ucma_ctl_table[] = {
+ {
+ .procname = "max_backlog",
+ .data = &max_backlog,
+ .maxlen = sizeof max_backlog,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+struct ucma_file {
+ struct mutex mut;
+ struct file *filp;
+ struct list_head ctx_list;
+ struct list_head event_list;
+ wait_queue_head_t poll_wait;
+ struct workqueue_struct *close_wq;
+};
+
+struct ucma_context {
+ int id;
+ struct completion comp;
+ atomic_t ref;
+ int events_reported;
+ int backlog;
+
+ struct ucma_file *file;
+ struct rdma_cm_id *cm_id;
+ struct mutex mutex;
+ u64 uid;
+
+ struct list_head list;
+ struct list_head mc_list;
+ /* mark that device is in process of destroying the internal HW
+ * resources, protected by the global mut
+ */
+ int closing;
+ /* sync between removal event and id destroy, protected by file mut */
+ int destroying;
+ struct work_struct close_work;
+};
+
+struct ucma_multicast {
+ struct ucma_context *ctx;
+ int id;
+ int events_reported;
+
+ u64 uid;
+ u8 join_state;
+ struct list_head list;
+ struct sockaddr_storage addr;
+};
+
+struct ucma_event {
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct list_head list;
+ struct rdma_cm_id *cm_id;
+ struct rdma_ucm_event_resp resp;
+ struct work_struct close_work;
+};
+
+static DEFINE_MUTEX(mut);
+static DEFINE_IDR(ctx_idr);
+static DEFINE_IDR(multicast_idr);
+
+static const struct file_operations ucma_fops;
+
+static inline struct ucma_context *_ucma_find_context(int id,
+ struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+
+ ctx = idr_find(&ctx_idr, id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file || !ctx->cm_id)
+ ctx = ERR_PTR(-EINVAL);
+ return ctx;
+}
+
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+ struct ucma_context *ctx;
+
+ mutex_lock(&mut);
+ ctx = _ucma_find_context(id, file);
+ if (!IS_ERR(ctx)) {
+ if (ctx->closing)
+ ctx = ERR_PTR(-EIO);
+ else
+ atomic_inc(&ctx->ref);
+ }
+ mutex_unlock(&mut);
+ return ctx;
+}
+
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->ref))
+ complete(&ctx->comp);
+}
+
+/*
+ * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the
+ * CM_ID is bound.
+ */
+static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id)
+{
+ struct ucma_context *ctx = ucma_get_ctx(file, id);
+
+ if (IS_ERR(ctx))
+ return ctx;
+ if (!ctx->cm_id->device) {
+ ucma_put_ctx(ctx);
+ return ERR_PTR(-EINVAL);
+ }
+ return ctx;
+}
+
+static void ucma_close_event_id(struct work_struct *work)
+{
+ struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work);
+
+ rdma_destroy_id(uevent_close->cm_id);
+ kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+ struct ucma_context *ctx = container_of(work, struct ucma_context, close_work);
+
+ /* once all inflight tasks are finished, we close all underlying
+ * resources. The context is still alive till its explicit destryoing
+ * by its creator.
+ */
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+}
+
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ INIT_WORK(&ctx->close_work, ucma_close_id);
+ atomic_set(&ctx->ref, 1);
+ init_completion(&ctx->comp);
+ INIT_LIST_HEAD(&ctx->mc_list);
+ ctx->file = file;
+ mutex_init(&ctx->mutex);
+
+ mutex_lock(&mut);
+ ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
+ mutex_unlock(&mut);
+ if (ctx->id < 0)
+ goto error;
+
+ list_add_tail(&ctx->list, &file->ctx_list);
+ return ctx;
+
+error:
+ kfree(ctx);
+ return NULL;
+}
+
+static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
+{
+ struct ucma_multicast *mc;
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+ if (!mc)
+ return NULL;
+
+ mutex_lock(&mut);
+ mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL);
+ mutex_unlock(&mut);
+ if (mc->id < 0)
+ goto error;
+
+ mc->ctx = ctx;
+ list_add_tail(&mc->list, &ctx->mc_list);
+ return mc;
+
+error:
+ kfree(mc);
+ return NULL;
+}
+
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+ struct rdma_conn_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+ dst->responder_resources =src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num;
+}
+
+static void ucma_copy_ud_event(struct ib_device *device,
+ struct rdma_ucm_ud_param *dst,
+ struct rdma_ud_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+ ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr);
+ dst->qp_num = src->qp_num;
+ dst->qkey = src->qkey;
+}
+
+static void ucma_set_event_context(struct ucma_context *ctx,
+ struct rdma_cm_event *event,
+ struct ucma_event *uevent)
+{
+ uevent->ctx = ctx;
+ switch (event->event) {
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ uevent->mc = (struct ucma_multicast *)
+ event->param.ud.private_data;
+ uevent->resp.uid = uevent->mc->uid;
+ uevent->resp.id = uevent->mc->id;
+ break;
+ default:
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ break;
+ }
+}
+
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+ struct ucma_context *ctx = cm_id->context;
+ struct ucma_event *con_req_eve;
+ int event_found = 0;
+
+ if (ctx->destroying)
+ return;
+
+ /* only if context is pointing to cm_id that it owns it and can be
+ * queued to be closed, otherwise that cm_id is an inflight one that
+ * is part of that context event list pending to be detached and
+ * reattached to its new context as part of ucma_get_event,
+ * handled separately below.
+ */
+ if (ctx->cm_id == cm_id) {
+ mutex_lock(&mut);
+ ctx->closing = 1;
+ mutex_unlock(&mut);
+ queue_work(ctx->file->close_wq, &ctx->close_work);
+ return;
+ }
+
+ list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+ if (con_req_eve->cm_id == cm_id &&
+ con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ list_del(&con_req_eve->list);
+ INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+ queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+ event_found = 1;
+ break;
+ }
+ }
+ if (!event_found)
+ pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ struct ucma_event *uevent;
+ struct ucma_context *ctx = cm_id->context;
+ int ret = 0;
+
+ uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+ if (!uevent)
+ return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+
+ mutex_lock(&ctx->file->mut);
+ uevent->cm_id = cm_id;
+ ucma_set_event_context(ctx, event, uevent);
+ uevent->resp.event = event->event;
+ uevent->resp.status = event->status;
+ if (cm_id->qp_type == IB_QPT_UD)
+ ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud,
+ &event->param.ud);
+ else
+ ucma_copy_conn_event(&uevent->resp.param.conn,
+ &event->param.conn);
+
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ if (!ctx->backlog) {
+ ret = -ENOMEM;
+ kfree(uevent);
+ goto out;
+ }
+ ctx->backlog--;
+ } else if (!ctx->uid || ctx->cm_id != cm_id) {
+ /*
+ * We ignore events for new connections until userspace has set
+ * their context. This can only happen if an error occurs on a
+ * new connection before the user accepts it. This is okay,
+ * since the accept will just fail later. However, we do need
+ * to release the underlying HW resources in case of a device
+ * removal event.
+ */
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
+
+ kfree(uevent);
+ goto out;
+ }
+
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ wake_up_interruptible(&ctx->file->poll_wait);
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
+out:
+ mutex_unlock(&ctx->file->mut);
+ return ret;
+}
+
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ucma_context *ctx;
+ struct rdma_ucm_get_event cmd;
+ struct ucma_event *uevent;
+ int ret = 0;
+
+ /*
+ * Old 32 bit user space does not send the 4 byte padding in the
+ * reserved field. We don't care, allow it to keep working.
+ */
+ if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->mut);
+ while (list_empty(&file->event_list)) {
+ mutex_unlock(&file->mut);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mut);
+ }
+
+ uevent = list_entry(file->event_list.next, struct ucma_event, list);
+
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ ctx = ucma_alloc_ctx(file);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ uevent->ctx->backlog++;
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id;
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &uevent->resp,
+ min_t(size_t, out_len, sizeof(uevent->resp)))) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ list_del(&uevent->list);
+ uevent->ctx->events_reported++;
+ if (uevent->mc)
+ uevent->mc->events_reported++;
+ kfree(uevent);
+done:
+ mutex_unlock(&file->mut);
+ return ret;
+}
+
+static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
+{
+ switch (cmd->ps) {
+ case RDMA_PS_TCP:
+ *qp_type = IB_QPT_RC;
+ return 0;
+ case RDMA_PS_UDP:
+ case RDMA_PS_IPOIB:
+ *qp_type = IB_QPT_UD;
+ return 0;
+ case RDMA_PS_IB:
+ *qp_type = cmd->qp_type;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_create_id cmd;
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ struct rdma_cm_id *cm_id;
+ enum ib_qp_type qp_type;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ret = ucma_get_qp_type(&cmd, &qp_type);
+ if (ret)
+ return ret;
+
+ mutex_lock(&file->mut);
+ ctx = ucma_alloc_ctx(file);
+ mutex_unlock(&file->mut);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->uid = cmd.uid;
+ cm_id = __rdma_create_id(current->nsproxy->net_ns,
+ ucma_event_handler, ctx, cmd.ps, qp_type, NULL);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ goto err1;
+ }
+
+ resp.id = ctx->id;
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err2;
+ }
+
+ ctx->cm_id = cm_id;
+ return 0;
+
+err2:
+ rdma_destroy_id(cm_id);
+err1:
+ mutex_lock(&mut);
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+ mutex_lock(&file->mut);
+ list_del(&ctx->list);
+ mutex_unlock(&file->mut);
+ kfree(ctx);
+ return ret;
+}
+
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+ struct ucma_multicast *mc, *tmp;
+
+ mutex_lock(&mut);
+ list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+ list_del(&mc->list);
+ idr_remove(&multicast_idr, mc->id);
+ kfree(mc);
+ }
+ mutex_unlock(&mut);
+}
+
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+ struct ucma_event *uevent, *tmp;
+
+ list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) {
+ if (uevent->mc != mc)
+ continue;
+
+ list_del(&uevent->list);
+ kfree(uevent);
+ }
+}
+
+/*
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to cleanup the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing. We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context pending events list while holding the
+ * mutex. After that we release them as needed.
+ */
+static int ucma_free_ctx(struct ucma_context *ctx)
+{
+ int events_reported;
+ struct ucma_event *uevent, *tmp;
+ LIST_HEAD(list);
+
+
+ ucma_cleanup_multicast(ctx);
+
+ /* Cleanup events not yet reported to the user. */
+ mutex_lock(&ctx->file->mut);
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &list);
+ }
+ list_del(&ctx->list);
+ events_reported = ctx->events_reported;
+ mutex_unlock(&ctx->file->mut);
+
+ list_for_each_entry_safe(uevent, tmp, &list, list) {
+ list_del(&uevent->list);
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ rdma_destroy_id(uevent->cm_id);
+ kfree(uevent);
+ }
+
+ mutex_destroy(&ctx->mutex);
+ kfree(ctx);
+ return events_reported;
+}
+
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_context *ctx;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&mut);
+ ctx = _ucma_find_context(cmd.id, file);
+ if (!IS_ERR(ctx))
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->file->mut);
+ ctx->destroying = 1;
+ mutex_unlock(&ctx->file->mut);
+
+ flush_workqueue(ctx->file->close_wq);
+ /* At this point it's guaranteed that there is no inflight
+ * closing task */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
+ resp.events_reported = ucma_free_ctx(ctx);
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind_ip cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!rdma_addr_size_in6(&cmd.addr))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+ mutex_unlock(&ctx->mutex);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.reserved || !cmd.addr_size ||
+ cmd.addr_size != rdma_addr_size_kss(&cmd.addr))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_ip(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_ip cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) ||
+ !rdma_addr_size_in6(&cmd.dst_addr))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+ (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_addr cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.reserved ||
+ (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) ||
+ !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr)))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+ (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_route cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ resp->num_paths = route->num_paths;
+ switch (route->num_paths) {
+ case 0:
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ /* fall through */
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+
+ resp->num_paths = route->num_paths;
+ switch (route->num_paths) {
+ case 0:
+ rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
+ (union ib_gid *)&resp->ib_route[0].dgid);
+ rdma_ip2gid((struct sockaddr *)&route->addr.src_addr,
+ (union ib_gid *)&resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ /* fall through */
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid);
+}
+
+static ssize_t ucma_query_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query cmd;
+ struct rdma_ucm_query_route_resp resp;
+ struct ucma_context *ctx;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ memset(&resp, 0, sizeof resp);
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ if (!ctx->cm_id->device)
+ goto out;
+
+ resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
+ resp.port_num = ctx->cm_id->port_num;
+
+ if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_iw_route(&resp, &ctx->cm_id->route);
+
+out:
+ mutex_unlock(&ctx->mutex);
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_query_device_addr(struct rdma_cm_id *cm_id,
+ struct rdma_ucm_query_addr_resp *resp)
+{
+ if (!cm_id->device)
+ return;
+
+ resp->node_guid = (__force __u64) cm_id->device->node_guid;
+ resp->port_num = cm_id->port_num;
+ resp->pkey = (__force __u16) cpu_to_be16(
+ ib_addr_get_pkey(&cm_id->route.addr.dev_addr));
+}
+
+static ssize_t ucma_query_addr(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ resp.src_size = rdma_addr_size(addr);
+ memcpy(&resp.src_addr, addr, resp.src_size);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ resp.dst_size = rdma_addr_size(addr);
+ memcpy(&resp.dst_addr, addr, resp.dst_size);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ if (copy_to_user(response, &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query_path(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_path_resp *resp;
+ int i, ret = 0;
+
+ if (out_len < sizeof(*resp))
+ return -ENOSPC;
+
+ resp = kzalloc(out_len, GFP_KERNEL);
+ if (!resp)
+ return -ENOMEM;
+
+ resp->num_paths = ctx->cm_id->route.num_paths;
+ for (i = 0, out_len -= sizeof(*resp);
+ i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data);
+ i++, out_len -= sizeof(struct ib_path_rec_data)) {
+ struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i];
+
+ resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL;
+ if (rec->rec_type == SA_PATH_REC_TYPE_OPA) {
+ struct sa_path_rec ib;
+
+ sa_convert_path_opa_to_ib(&ib, rec);
+ ib_sa_pack_path(&ib, &resp->path_data[i].path_rec);
+
+ } else {
+ ib_sa_pack_path(rec, &resp->path_data[i].path_rec);
+ }
+ }
+
+ if (copy_to_user(response, resp,
+ sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+ ret = -EFAULT;
+
+ kfree(resp);
+ return ret;
+}
+
+static ssize_t ucma_query_gid(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr_ib *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ addr = (struct sockaddr_ib *) &resp.src_addr;
+ resp.src_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr,
+ NULL);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.src_addr);
+ }
+
+ addr = (struct sockaddr_ib *) &resp.dst_addr;
+ resp.dst_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_read_gids(ctx->cm_id, NULL,
+ (union ib_gid *)&addr->sib_addr);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.dst_addr);
+ }
+
+ if (copy_to_user(response, &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query cmd;
+ struct ucma_context *ctx;
+ void __user *response;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ response = u64_to_user_ptr(cmd.response);
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ switch (cmd.option) {
+ case RDMA_USER_CM_QUERY_ADDR:
+ ret = ucma_query_addr(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_PATH:
+ ret = ucma_query_path(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_GID:
+ ret = ucma_query_gid(ctx, response, out_len);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ mutex_unlock(&ctx->mutex);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_conn_param(struct rdma_cm_id *id,
+ struct rdma_conn_param *dst,
+ struct rdma_ucm_conn_param *src)
+{
+ dst->private_data = src->private_data;
+ dst->private_data_len = src->private_data_len;
+ dst->responder_resources =src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num;
+ dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0;
+}
+
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_connect cmd;
+ struct rdma_conn_param conn_param;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!cmd.conn_param.valid)
+ return -EINVAL;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+ mutex_lock(&ctx->mutex);
+ ret = rdma_connect(ctx->cm_id, &conn_param);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_listen cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ?
+ cmd.backlog : max_backlog;
+ mutex_lock(&ctx->mutex);
+ ret = rdma_listen(ctx->cm_id, ctx->backlog);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_accept cmd;
+ struct rdma_conn_param conn_param;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (cmd.conn_param.valid) {
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+ mutex_lock(&file->mut);
+ mutex_lock(&ctx->mutex);
+ ret = __rdma_accept(ctx->cm_id, &conn_param, NULL);
+ mutex_unlock(&ctx->mutex);
+ if (!ret)
+ ctx->uid = cmd.uid;
+ mutex_unlock(&file->mut);
+ } else {
+ mutex_lock(&ctx->mutex);
+ ret = __rdma_accept(ctx->cm_id, NULL, NULL);
+ mutex_unlock(&ctx->mutex);
+ }
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_reject cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_disconnect cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_disconnect(ctx->cm_id);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_init_qp_attr cmd;
+ struct ib_uverbs_qp_attr resp;
+ struct ucma_context *ctx;
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.qp_state > IB_QPS_ERR)
+ return -EINVAL;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.qp_attr_mask = 0;
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.qp_state = cmd.qp_state;
+ mutex_lock(&ctx->mutex);
+ ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+ mutex_unlock(&ctx->mutex);
+ if (ret)
+ goto out;
+
+ ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret = 0;
+
+ switch (optname) {
+ case RDMA_OPTION_ID_TOS:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+ break;
+ case RDMA_OPTION_ID_REUSEADDR:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
+ case RDMA_OPTION_ID_AFONLY:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_ib_path(struct ucma_context *ctx,
+ struct ib_path_rec_data *path_data, size_t optlen)
+{
+ struct sa_path_rec sa_path;
+ struct rdma_cm_event event;
+ int ret;
+
+ if (optlen % sizeof(*path_data))
+ return -EINVAL;
+
+ for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+ if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL))
+ break;
+ }
+
+ if (!optlen)
+ return -EINVAL;
+
+ if (!ctx->cm_id->device)
+ return -EINVAL;
+
+ memset(&sa_path, 0, sizeof(sa_path));
+
+ sa_path.rec_type = SA_PATH_REC_TYPE_IB;
+ ib_sa_unpack_path(path_data->path_rec, &sa_path);
+
+ if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) {
+ struct sa_path_rec opa;
+
+ sa_convert_path_ib_to_opa(&opa, &sa_path);
+ mutex_lock(&ctx->mutex);
+ ret = rdma_set_ib_path(ctx->cm_id, &opa);
+ mutex_unlock(&ctx->mutex);
+ } else {
+ mutex_lock(&ctx->mutex);
+ ret = rdma_set_ib_path(ctx->cm_id, &sa_path);
+ mutex_unlock(&ctx->mutex);
+ }
+ if (ret)
+ return ret;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (optname) {
+ case RDMA_OPTION_IB_PATH:
+ ret = ucma_set_ib_path(ctx, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+ int optname, void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (level) {
+ case RDMA_OPTION_ID:
+ mutex_lock(&ctx->mutex);
+ ret = ucma_set_option_id(ctx, optname, optval, optlen);
+ mutex_unlock(&ctx->mutex);
+ break;
+ case RDMA_OPTION_IB:
+ ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_set_option cmd;
+ struct ucma_context *ctx;
+ void *optval;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ optval = memdup_user(u64_to_user_ptr(cmd.optval),
+ cmd.optlen);
+ if (IS_ERR(optval)) {
+ ret = PTR_ERR(optval);
+ goto out;
+ }
+
+ ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+ cmd.optlen);
+ kfree(optval);
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_notify cmd;
+ struct ucma_context *ctx;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ if (ctx->cm_id->device)
+ ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event);
+ mutex_unlock(&ctx->mutex);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_process_join(struct ucma_file *file,
+ struct rdma_ucm_join_mcast *cmd, int out_len)
+{
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct sockaddr *addr;
+ int ret;
+ u8 join_state;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ addr = (struct sockaddr *) &cmd->addr;
+ if (cmd->addr_size != rdma_addr_size(addr))
+ return -EINVAL;
+
+ if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER)
+ join_state = BIT(FULLMEMBER_JOIN);
+ else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)
+ join_state = BIT(SENDONLY_FULLMEMBER_JOIN);
+ else
+ return -EINVAL;
+
+ ctx = ucma_get_ctx_dev(file, cmd->id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&file->mut);
+ mc = ucma_alloc_multicast(ctx);
+ if (!mc) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+ mc->join_state = join_state;
+ mc->uid = cmd->uid;
+ memcpy(&mc->addr, addr, cmd->addr_size);
+ mutex_lock(&ctx->mutex);
+ ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
+ join_state, mc);
+ mutex_unlock(&ctx->mutex);
+ if (ret)
+ goto err2;
+
+ resp.id = mc->id;
+ if (copy_to_user(u64_to_user_ptr(cmd->response),
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err3;
+ }
+
+ mutex_lock(&mut);
+ idr_replace(&multicast_idr, mc, mc->id);
+ mutex_unlock(&mut);
+
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return 0;
+
+err3:
+ mutex_lock(&ctx->mutex);
+ rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+ mutex_unlock(&ctx->mutex);
+ ucma_cleanup_mc_events(mc);
+err2:
+ mutex_lock(&mut);
+ idr_remove(&multicast_idr, mc->id);
+ mutex_unlock(&mut);
+ list_del(&mc->list);
+ kfree(mc);
+err1:
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_ip_mcast cmd;
+ struct rdma_ucm_join_mcast join_cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ join_cmd.response = cmd.response;
+ join_cmd.uid = cmd.uid;
+ join_cmd.id = cmd.id;
+ join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr);
+ if (!join_cmd.addr_size)
+ return -EINVAL;
+
+ join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
+ memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
+
+ return ucma_process_join(file, &join_cmd, out_len);
+}
+
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_mcast cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!rdma_addr_size_kss(&cmd.addr))
+ return -EINVAL;
+
+ return ucma_process_join(file, &cmd, out_len);
+}
+
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_multicast *mc;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&mut);
+ mc = idr_find(&multicast_idr, cmd.id);
+ if (!mc)
+ mc = ERR_PTR(-ENOENT);
+ else if (mc->ctx->file != file)
+ mc = ERR_PTR(-EINVAL);
+ else if (!atomic_inc_not_zero(&mc->ctx->ref))
+ mc = ERR_PTR(-ENXIO);
+ else
+ idr_remove(&multicast_idr, mc->id);
+ mutex_unlock(&mut);
+
+ if (IS_ERR(mc)) {
+ ret = PTR_ERR(mc);
+ goto out;
+ }
+
+ mutex_lock(&mc->ctx->mutex);
+ rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+ mutex_unlock(&mc->ctx->mutex);
+
+ mutex_lock(&mc->ctx->file->mut);
+ ucma_cleanup_mc_events(mc);
+ list_del(&mc->list);
+ mutex_unlock(&mc->ctx->file->mut);
+
+ ucma_put_ctx(mc->ctx);
+ resp.events_reported = mc->events_reported;
+ kfree(mc);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+out:
+ return ret;
+}
+
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+ /* Acquire mutex's based on pointer comparison to prevent deadlock. */
+ if (file1 < file2) {
+ mutex_lock(&file1->mut);
+ mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING);
+ } else {
+ mutex_lock(&file2->mut);
+ mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+ if (file1 < file2) {
+ mutex_unlock(&file2->mut);
+ mutex_unlock(&file1->mut);
+ } else {
+ mutex_unlock(&file1->mut);
+ mutex_unlock(&file2->mut);
+ }
+}
+
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+ struct ucma_event *uevent, *tmp;
+
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &file->event_list);
+}
+
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_migrate_id cmd;
+ struct rdma_ucm_migrate_resp resp;
+ struct ucma_context *ctx;
+ struct fd f;
+ struct ucma_file *cur_file;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ /* Get current fd to protect against it being closed */
+ f = fdget(cmd.fd);
+ if (!f.file)
+ return -ENOENT;
+ if (f.file->f_op != &ucma_fops) {
+ ret = -EINVAL;
+ goto file_put;
+ }
+
+ /* Validate current fd and prevent destruction of id. */
+ ctx = ucma_get_ctx(f.file->private_data, cmd.id);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
+ goto file_put;
+ }
+
+ cur_file = ctx->file;
+ if (cur_file == new_file) {
+ mutex_lock(&cur_file->mut);
+ resp.events_reported = ctx->events_reported;
+ mutex_unlock(&cur_file->mut);
+ goto response;
+ }
+
+ /*
+ * Migrate events between fd's, maintaining order, and avoiding new
+ * events being added before existing events.
+ */
+ ucma_lock_files(cur_file, new_file);
+ mutex_lock(&mut);
+
+ list_move_tail(&ctx->list, &new_file->ctx_list);
+ ucma_move_events(ctx, new_file);
+ ctx->file = new_file;
+ resp.events_reported = ctx->events_reported;
+
+ mutex_unlock(&mut);
+ ucma_unlock_files(cur_file, new_file);
+
+response:
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ ucma_put_ctx(ctx);
+file_put:
+ fdput(f);
+ return ret;
+}
+
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id,
+ [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id,
+ [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route,
+ [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route,
+ [RDMA_USER_CM_CMD_CONNECT] = ucma_connect,
+ [RDMA_USER_CM_CMD_LISTEN] = ucma_listen,
+ [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept,
+ [RDMA_USER_CM_CMD_REJECT] = ucma_reject,
+ [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect,
+ [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
+ [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
+ [RDMA_USER_CM_CMD_GET_OPTION] = NULL,
+ [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
+ [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
+ [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast,
+ [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
+ [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id,
+ [RDMA_USER_CM_CMD_QUERY] = ucma_query,
+ [RDMA_USER_CM_CMD_BIND] = ucma_bind,
+ [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast
+};
+
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct ucma_file *file = filp->private_data;
+ struct rdma_ucm_cmd_hdr hdr;
+ ssize_t ret;
+
+ if (!ib_safe_file_access(filp)) {
+ pr_err_once("ucma_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
+ task_tgid_vnr(current), current->comm);
+ return -EACCES;
+ }
+
+ if (len < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+ return -EINVAL;
+ hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table));
+
+ if (hdr.in + sizeof(hdr) > len)
+ return -EINVAL;
+
+ if (!ucma_cmd_table[hdr.cmd])
+ return -ENOSYS;
+
+ ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
+ if (!ret)
+ ret = len;
+
+ return ret;
+}
+
+static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ucma_file *file = filp->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ if (!list_empty(&file->event_list))
+ mask = EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+/*
+ * ucma_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file;
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ file->close_wq = alloc_ordered_workqueue("ucma_close_id",
+ WQ_MEM_RECLAIM);
+ if (!file->close_wq) {
+ kfree(file);
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&file->event_list);
+ INIT_LIST_HEAD(&file->ctx_list);
+ init_waitqueue_head(&file->poll_wait);
+ mutex_init(&file->mut);
+
+ filp->private_data = file;
+ file->filp = filp;
+
+ return nonseekable_open(inode, filp);
+}
+
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file = filp->private_data;
+ struct ucma_context *ctx, *tmp;
+
+ mutex_lock(&file->mut);
+ list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ ctx->destroying = 1;
+ mutex_unlock(&file->mut);
+
+ mutex_lock(&mut);
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ flush_workqueue(file->close_wq);
+ /* At that step once ctx was marked as destroying and workqueue
+ * was flushed we are safe from any inflights handlers that
+ * might put other closing task.
+ */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ /* rdma_destroy_id ensures that no event handlers are
+ * inflight for that id before releasing it.
+ */
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
+ ucma_free_ctx(ctx);
+ mutex_lock(&file->mut);
+ }
+ mutex_unlock(&file->mut);
+ destroy_workqueue(file->close_wq);
+ kfree(file);
+ return 0;
+}
+
+static const struct file_operations ucma_fops = {
+ .owner = THIS_MODULE,
+ .open = ucma_open,
+ .release = ucma_close,
+ .write = ucma_write,
+ .poll = ucma_poll,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice ucma_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rdma_cm",
+ .nodename = "infiniband/rdma_cm",
+ .mode = 0666,
+ .fops = &ucma_fops,
+};
+
+static ssize_t show_abi_version(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int __init ucma_init(void)
+{
+ int ret;
+
+ ret = misc_register(&ucma_misc);
+ if (ret)
+ return ret;
+
+ ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+ if (ret) {
+ pr_err("rdma_ucm: couldn't create abi_version attr\n");
+ goto err1;
+ }
+
+ ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table);
+ if (!ucma_ctl_table_hdr) {
+ pr_err("rdma_ucm: couldn't register sysctl paths\n");
+ ret = -ENOMEM;
+ goto err2;
+ }
+ return 0;
+err2:
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+err1:
+ misc_deregister(&ucma_misc);
+ return ret;
+}
+
+static void __exit ucma_cleanup(void)
+{
+ unregister_net_sysctl_table(ucma_ctl_table_hdr);
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+ misc_deregister(&ucma_misc);
+ idr_destroy(&ctx_idr);
+ idr_destroy(&multicast_idr);
+}
+
+module_init(ucma_init);
+module_exit(ucma_cleanup);
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
new file mode 100644
index 000000000..29a45d2f8
--- /dev/null
+++ b/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+
+#include <rdma/ib_pack.h>
+
+#define STRUCT_FIELD(header, field) \
+ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
+ .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+ .field_name = #header ":" #field
+
+static const struct ib_field lrh_table[] = {
+ { STRUCT_FIELD(lrh, virtual_lane),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, link_version),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, service_level),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 4 },
+ { RESERVED,
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, link_next_header),
+ .offset_words = 0,
+ .offset_bits = 14,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, destination_lid),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 5 },
+ { STRUCT_FIELD(lrh, packet_length),
+ .offset_words = 1,
+ .offset_bits = 5,
+ .size_bits = 11 },
+ { STRUCT_FIELD(lrh, source_lid),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field eth_table[] = {
+ { STRUCT_FIELD(eth, dmac_h),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, dmac_l),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_h),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_l),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, type),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 16 }
+};
+
+static const struct ib_field vlan_table[] = {
+ { STRUCT_FIELD(vlan, tag),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(vlan, type),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field ip4_table[] = {
+ { STRUCT_FIELD(ip4, ver),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, hdr_len),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, tos),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, tot_len),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, id),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, frag_off),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, ttl),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, protocol),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, check),
+ .offset_words = 2,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, saddr),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(ip4, daddr),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 32 }
+};
+
+static const struct ib_field udp_table[] = {
+ { STRUCT_FIELD(udp, sport),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, dport),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, csum),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field grh_table[] = {
+ { STRUCT_FIELD(grh, ip_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(grh, traffic_class),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, flow_label),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 20 },
+ { STRUCT_FIELD(grh, payload_length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(grh, next_header),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, hop_limit),
+ .offset_words = 1,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, source_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { STRUCT_FIELD(grh, destination_gid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 }
+};
+
+static const struct ib_field bth_table[] = {
+ { STRUCT_FIELD(bth, opcode),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, solicited_event),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, mig_req),
+ .offset_words = 0,
+ .offset_bits = 9,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, pad_count),
+ .offset_words = 0,
+ .offset_bits = 10,
+ .size_bits = 2 },
+ { STRUCT_FIELD(bth, transport_header_version),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { STRUCT_FIELD(bth, pkey),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, destination_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 },
+ { STRUCT_FIELD(bth, ack_req),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 2,
+ .offset_bits = 1,
+ .size_bits = 7 },
+ { STRUCT_FIELD(bth, psn),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+ { STRUCT_FIELD(deth, qkey),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(deth, source_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header)
+{
+ struct iphdr iph;
+
+ iph.ihl = 5;
+ iph.version = 4;
+ iph.tos = header->ip4.tos;
+ iph.tot_len = header->ip4.tot_len;
+ iph.id = header->ip4.id;
+ iph.frag_off = header->ip4.frag_off;
+ iph.ttl = header->ip4.ttl;
+ iph.protocol = header->ip4.protocol;
+ iph.check = 0;
+ iph.saddr = header->ip4.saddr;
+ iph.daddr = header->ip4.daddr;
+
+ return ip_fast_csum((u8 *)&iph, iph.ihl);
+}
+EXPORT_SYMBOL(ib_ud_ip4_csum);
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
+ * @vlan_present: packet is tagged vlan
+ * @grh_present: GRH flag (if non-zero, GRH will be included)
+ * @ip_version: if non-zero, IP header, V4 or V6, will be included
+ * @udp_present :if non-zero, UDP header will be included
+ * @immediate_present: specify if immediate data is present
+ * @header:Structure to initialize
+ */
+int ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int ip_version,
+ int udp_present,
+ int immediate_present,
+ struct ib_ud_header *header)
+{
+ size_t udp_bytes = udp_present ? IB_UDP_BYTES : 0;
+
+ grh_present = grh_present && !ip_version;
+ memset(header, 0, sizeof *header);
+
+ /*
+ * UDP header without IP header doesn't make sense
+ */
+ if (udp_present && ip_version != 4 && ip_version != 6)
+ return -EINVAL;
+
+ if (lrh_present) {
+ u16 packet_length;
+
+ header->lrh.link_version = 0;
+ header->lrh.link_next_header =
+ grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+ packet_length = (IB_LRH_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ (grh_present ? IB_GRH_BYTES : 0) +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) / 4; /* round up */
+ header->lrh.packet_length = cpu_to_be16(packet_length);
+ }
+
+ if (vlan_present)
+ header->eth.type = cpu_to_be16(ETH_P_8021Q);
+
+ if (ip_version == 6 || grh_present) {
+ header->grh.ip_version = 6;
+ header->grh.payload_length =
+ cpu_to_be16((udp_bytes +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) & ~3); /* round up */
+ header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b;
+ }
+
+ if (ip_version == 4) {
+ header->ip4.ver = 4; /* version 4 */
+ header->ip4.hdr_len = 5; /* 5 words */
+ header->ip4.tot_len =
+ cpu_to_be16(IB_IP4_BYTES +
+ udp_bytes +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4); /* ICRC */
+ header->ip4.protocol = IPPROTO_UDP;
+ }
+ if (udp_present && ip_version)
+ header->udp.length =
+ cpu_to_be16(IB_UDP_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4); /* ICRC */
+
+ if (immediate_present)
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ else
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ header->bth.pad_count = (4 - payload_bytes) & 3;
+ header->bth.transport_header_version = 0;
+
+ header->lrh_present = lrh_present;
+ header->eth_present = eth_present;
+ header->vlan_present = vlan_present;
+ header->grh_present = grh_present || (ip_version == 6);
+ header->ipv4_present = ip_version == 4;
+ header->udp_present = udp_present;
+ header->immediate_present = immediate_present;
+ return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf)
+{
+ int len = 0;
+
+ if (header->lrh_present) {
+ ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+ &header->lrh, buf + len);
+ len += IB_LRH_BYTES;
+ }
+ if (header->eth_present) {
+ ib_pack(eth_table, ARRAY_SIZE(eth_table),
+ &header->eth, buf + len);
+ len += IB_ETH_BYTES;
+ }
+ if (header->vlan_present) {
+ ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
+ &header->vlan, buf + len);
+ len += IB_VLAN_BYTES;
+ }
+ if (header->grh_present) {
+ ib_pack(grh_table, ARRAY_SIZE(grh_table),
+ &header->grh, buf + len);
+ len += IB_GRH_BYTES;
+ }
+ if (header->ipv4_present) {
+ ib_pack(ip4_table, ARRAY_SIZE(ip4_table),
+ &header->ip4, buf + len);
+ len += IB_IP4_BYTES;
+ }
+ if (header->udp_present) {
+ ib_pack(udp_table, ARRAY_SIZE(udp_table),
+ &header->udp, buf + len);
+ len += IB_UDP_BYTES;
+ }
+
+ ib_pack(bth_table, ARRAY_SIZE(bth_table),
+ &header->bth, buf + len);
+ len += IB_BTH_BYTES;
+
+ ib_pack(deth_table, ARRAY_SIZE(deth_table),
+ &header->deth, buf + len);
+ len += IB_DETH_BYTES;
+
+ if (header->immediate_present) {
+ memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+ len += sizeof header->immediate_data;
+ }
+
+ return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header)
+{
+ ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+ buf, &header->lrh);
+ buf += IB_LRH_BYTES;
+
+ if (header->lrh.link_version != 0) {
+ pr_warn("Invalid LRH.link_version %d\n",
+ header->lrh.link_version);
+ return -EINVAL;
+ }
+
+ switch (header->lrh.link_next_header) {
+ case IB_LNH_IBA_LOCAL:
+ header->grh_present = 0;
+ break;
+
+ case IB_LNH_IBA_GLOBAL:
+ header->grh_present = 1;
+ ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+ buf, &header->grh);
+ buf += IB_GRH_BYTES;
+
+ if (header->grh.ip_version != 6) {
+ pr_warn("Invalid GRH.ip_version %d\n",
+ header->grh.ip_version);
+ return -EINVAL;
+ }
+ if (header->grh.next_header != 0x1b) {
+ pr_warn("Invalid GRH.next_header 0x%02x\n",
+ header->grh.next_header);
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ pr_warn("Invalid LRH.link_next_header %d\n",
+ header->lrh.link_next_header);
+ return -EINVAL;
+ }
+
+ ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+ buf, &header->bth);
+ buf += IB_BTH_BYTES;
+
+ switch (header->bth.opcode) {
+ case IB_OPCODE_UD_SEND_ONLY:
+ header->immediate_present = 0;
+ break;
+ case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+ header->immediate_present = 1;
+ break;
+ default:
+ pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode);
+ return -EINVAL;
+ }
+
+ if (header->bth.transport_header_version != 0) {
+ pr_warn("Invalid BTH.transport_header_version %d\n",
+ header->bth.transport_header_version);
+ return -EINVAL;
+ }
+
+ ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+ buf, &header->deth);
+ buf += IB_DETH_BYTES;
+
+ if (header->immediate_present)
+ memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
new file mode 100644
index 000000000..a41792dba
--- /dev/null
+++ b/drivers/infiniband/core/umem.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
+#include <linux/export.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "uverbs.h"
+
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+ struct scatterlist *sg;
+ struct page *page;
+ int i;
+
+ if (umem->nmap > 0)
+ ib_dma_unmap_sg(dev, umem->sg_head.sgl,
+ umem->npages,
+ DMA_BIDIRECTIONAL);
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
+
+ page = sg_page(sg);
+ if (!PageDirty(page) && umem->writable && dirty)
+ set_page_dirty_lock(page);
+ put_page(page);
+ }
+
+ sg_free_table(&umem->sg_head);
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * If access flags indicate ODP memory, avoid pinning. Instead, stores
+ * the mm for future page fault handling in conjunction with MMU notifiers.
+ *
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync)
+{
+ struct ib_umem *umem;
+ struct page **page_list;
+ struct vm_area_struct **vma_list;
+ unsigned long lock_limit;
+ unsigned long cur_base;
+ unsigned long npages;
+ int ret;
+ int i;
+ unsigned long dma_attrs = 0;
+ struct scatterlist *sg, *sg_list_start;
+ unsigned int gup_flags = FOLL_WRITE;
+
+ if (dmasync)
+ dma_attrs |= DMA_ATTR_WRITE_BARRIER;
+
+ /*
+ * If the combination of the addr and size requested for this memory
+ * region causes an integer overflow, return error.
+ */
+ if (((addr + size) < addr) ||
+ PAGE_ALIGN(addr + size) < (addr + size))
+ return ERR_PTR(-EINVAL);
+
+ if (!can_do_mlock())
+ return ERR_PTR(-EPERM);
+
+ umem = kzalloc(sizeof *umem, GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ umem->context = context;
+ umem->length = size;
+ umem->address = addr;
+ umem->page_shift = PAGE_SHIFT;
+ umem->writable = ib_access_writable(access);
+
+ if (access & IB_ACCESS_ON_DEMAND) {
+ ret = ib_umem_odp_get(context, umem, access);
+ if (ret)
+ goto umem_kfree;
+ return umem;
+ }
+
+ umem->odp_data = NULL;
+
+ /* We assume the memory is from hugetlb until proved otherwise */
+ umem->hugetlb = 1;
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list) {
+ ret = -ENOMEM;
+ goto umem_kfree;
+ }
+
+ /*
+ * if we can't alloc the vma_list, it's not so bad;
+ * just assume the memory is not hugetlb memory
+ */
+ vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+ if (!vma_list)
+ umem->hugetlb = 0;
+
+ npages = ib_umem_num_pages(umem);
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ down_write(&current->mm->mmap_sem);
+ current->mm->pinned_vm += npages;
+ if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ up_write(&current->mm->mmap_sem);
+ ret = -ENOMEM;
+ goto vma;
+ }
+ up_write(&current->mm->mmap_sem);
+
+ cur_base = addr & PAGE_MASK;
+
+ if (npages == 0 || npages > UINT_MAX) {
+ ret = -EINVAL;
+ goto vma;
+ }
+
+ ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+ if (ret)
+ goto vma;
+
+ if (!umem->writable)
+ gup_flags |= FOLL_FORCE;
+
+ sg_list_start = umem->sg_head.sgl;
+
+ down_read(&current->mm->mmap_sem);
+ while (npages) {
+ ret = get_user_pages_longterm(cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof (struct page *)),
+ gup_flags, page_list, vma_list);
+ if (ret < 0) {
+ up_read(&current->mm->mmap_sem);
+ goto umem_release;
+ }
+
+ umem->npages += ret;
+ cur_base += ret * PAGE_SIZE;
+ npages -= ret;
+
+ for_each_sg(sg_list_start, sg, ret, i) {
+ if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
+ umem->hugetlb = 0;
+
+ sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+ }
+
+ /* preparing for next loop */
+ sg_list_start = sg;
+ }
+ up_read(&current->mm->mmap_sem);
+
+ umem->nmap = ib_dma_map_sg_attrs(context->device,
+ umem->sg_head.sgl,
+ umem->npages,
+ DMA_BIDIRECTIONAL,
+ dma_attrs);
+
+ if (!umem->nmap) {
+ ret = -ENOMEM;
+ goto umem_release;
+ }
+
+ ret = 0;
+ goto out;
+
+umem_release:
+ __ib_umem_release(context->device, umem, 0);
+vma:
+ down_write(&current->mm->mmap_sem);
+ current->mm->pinned_vm -= ib_umem_num_pages(umem);
+ up_write(&current->mm->mmap_sem);
+out:
+ if (vma_list)
+ free_page((unsigned long) vma_list);
+ free_page((unsigned long) page_list);
+umem_kfree:
+ if (ret)
+ kfree(umem);
+ return ret ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_get);
+
+static void ib_umem_account(struct work_struct *work)
+{
+ struct ib_umem *umem = container_of(work, struct ib_umem, work);
+
+ down_write(&umem->mm->mmap_sem);
+ umem->mm->pinned_vm -= umem->diff;
+ up_write(&umem->mm->mmap_sem);
+ mmput(umem->mm);
+ kfree(umem);
+}
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ unsigned long diff;
+
+ if (umem->odp_data) {
+ ib_umem_odp_release(umem);
+ return;
+ }
+
+ __ib_umem_release(umem->context->device, umem, 1);
+
+ task = get_pid_task(umem->context->tgid, PIDTYPE_PID);
+ if (!task)
+ goto out;
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ diff = ib_umem_num_pages(umem);
+
+ /*
+ * We may be called with the mm's mmap_sem already held. This
+ * can happen when a userspace munmap() is the call that drops
+ * the last reference to our file and calls our release
+ * method. If there are memory regions to destroy, we'll end
+ * up here and not be able to take the mmap_sem. In that case
+ * we defer the vm_locked accounting to the system workqueue.
+ */
+ if (context->closing) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ INIT_WORK(&umem->work, ib_umem_account);
+ umem->mm = mm;
+ umem->diff = diff;
+
+ queue_work(ib_wq, &umem->work);
+ return;
+ }
+ } else
+ down_write(&mm->mmap_sem);
+
+ mm->pinned_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+out:
+ kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+ int i;
+ int n;
+ struct scatterlist *sg;
+
+ if (umem->odp_data)
+ return ib_umem_num_pages(umem);
+
+ n = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
+ n += sg_dma_len(sg) >> umem->page_shift;
+
+ return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
+
+/*
+ * Copy from the given ib_umem's pages to the given buffer.
+ *
+ * umem - the umem to copy from
+ * offset - offset to start copying from
+ * dst - destination buffer
+ * length - buffer length
+ *
+ * Returns 0 on success, or an error code.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length)
+{
+ size_t end = offset + length;
+ int ret;
+
+ if (offset > umem->length || length > umem->length - offset) {
+ pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
+ offset, umem->length, end);
+ return -EINVAL;
+ }
+
+ ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
+ offset + ib_umem_offset(umem));
+
+ if (ret < 0)
+ return ret;
+ else if (ret != length)
+ return -EINVAL;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000..fd6ec56c1
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
+#include <linux/interval_tree_generic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+/*
+ * The ib_umem list keeps track of memory regions for which the HW
+ * device request to receive notification when the related memory
+ * mapping is changed.
+ *
+ * ib_umem_lock protects the list.
+ */
+
+static u64 node_start(struct umem_odp_node *n)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(n, struct ib_umem_odp, interval_tree);
+
+ return ib_umem_start(umem_odp->umem);
+}
+
+/* Note that the representation of the intervals in the interval tree
+ * considers the ending point as contained in the interval, while the
+ * function ib_umem_end returns the first address which is not contained
+ * in the umem.
+ */
+static u64 node_last(struct umem_odp_node *n)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(n, struct ib_umem_odp, interval_tree);
+
+ return ib_umem_end(umem_odp->umem) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
+ node_start, node_last, static, rbt_ib_umem)
+
+static void ib_umem_notifier_start_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ int notifiers_count = item->odp_data->notifiers_count++;
+
+ if (notifiers_count == 0)
+ /* Initialize the completion object for waiting on
+ * notifiers. Since notifier_count is zero, no one
+ * should be waiting right now. */
+ reinit_completion(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+static void ib_umem_notifier_end_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ /*
+ * This sequence increase will notify the QP page fault that
+ * the page that is going to be mapped in the spte could have
+ * been freed.
+ */
+ ++item->odp_data->notifiers_seq;
+ if (--item->odp_data->notifiers_count == 0)
+ complete_all(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+/* Account for a new mmu notifier in an ib_ucontext. */
+static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+{
+ atomic_inc(&context->notifier_count);
+}
+
+/* Account for a terminating mmu notifier in an ib_ucontext.
+ *
+ * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
+ * the function takes the semaphore itself. */
+static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
+{
+ int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
+
+ if (zero_notifiers &&
+ !list_empty(&context->no_private_counters)) {
+ /* No currently running mmu notifiers. Now is the chance to
+ * add private accounting to all previously added umems. */
+ struct ib_umem_odp *odp_data, *next;
+
+ /* Prevent concurrent mmu notifiers from working on the
+ * no_private_counters list. */
+ down_write(&context->umem_rwsem);
+
+ /* Read the notifier_count again, with the umem_rwsem
+ * semaphore taken for write. */
+ if (!atomic_read(&context->notifier_count)) {
+ list_for_each_entry_safe(odp_data, next,
+ &context->no_private_counters,
+ no_private_counters) {
+ mutex_lock(&odp_data->umem_mutex);
+ odp_data->mn_counters_active = true;
+ list_del(&odp_data->no_private_counters);
+ complete_all(&odp_data->notifier_completion);
+ mutex_unlock(&odp_data->umem_mutex);
+ }
+ }
+
+ up_write(&context->umem_rwsem);
+ }
+}
+
+static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie) {
+ /*
+ * Increase the number of notifiers running, to
+ * prevent any further fault handling on this MR.
+ */
+ ib_umem_notifier_start_account(item);
+ item->odp_data->dying = 1;
+ /* Make sure that the fact the umem is dying is out before we release
+ * all pending page faults. */
+ smp_wmb();
+ complete_all(&item->odp_data->notifier_completion);
+ item->context->invalidate_range(item, ib_umem_start(item),
+ ib_umem_end(item));
+ return 0;
+}
+
+static void ib_umem_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
+ ULLONG_MAX,
+ ib_umem_notifier_release_trampoline,
+ true,
+ NULL);
+ up_read(&context->umem_rwsem);
+}
+
+static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, start + PAGE_SIZE);
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, end);
+ return 0;
+}
+
+static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ bool blockable)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+ int ret;
+
+ if (!context->invalidate_range)
+ return 0;
+
+ if (blockable)
+ down_read(&context->umem_rwsem);
+ else if (!down_read_trylock(&context->umem_rwsem))
+ return -EAGAIN;
+
+ ib_ucontext_notifier_start_account(context);
+ ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_start_trampoline,
+ blockable, NULL);
+ up_read(&context->umem_rwsem);
+
+ return ret;
+}
+
+static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ /*
+ * TODO: we currently bail out if there is any sleepable work to be done
+ * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
+ * here. But this is ugly and fragile.
+ */
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_end_trampoline, true, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
+static const struct mmu_notifier_ops ib_umem_notifiers = {
+ .release = ib_umem_notifier_release,
+ .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
+ .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
+};
+
+struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
+ unsigned long addr,
+ size_t size)
+{
+ struct ib_umem *umem;
+ struct ib_umem_odp *odp_data;
+ int pages = size >> PAGE_SHIFT;
+ int ret;
+
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ umem->context = context;
+ umem->length = size;
+ umem->address = addr;
+ umem->page_shift = PAGE_SHIFT;
+ umem->writable = 1;
+
+ odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
+ if (!odp_data) {
+ ret = -ENOMEM;
+ goto out_umem;
+ }
+ odp_data->umem = umem;
+
+ mutex_init(&odp_data->umem_mutex);
+ init_completion(&odp_data->notifier_completion);
+
+ odp_data->page_list =
+ vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
+ if (!odp_data->page_list) {
+ ret = -ENOMEM;
+ goto out_odp_data;
+ }
+
+ odp_data->dma_list =
+ vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
+ if (!odp_data->dma_list) {
+ ret = -ENOMEM;
+ goto out_page_list;
+ }
+
+ down_write(&context->umem_rwsem);
+ context->odp_mrs_count++;
+ rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree);
+ if (likely(!atomic_read(&context->notifier_count)))
+ odp_data->mn_counters_active = true;
+ else
+ list_add(&odp_data->no_private_counters,
+ &context->no_private_counters);
+ up_write(&context->umem_rwsem);
+
+ umem->odp_data = odp_data;
+
+ return umem;
+
+out_page_list:
+ vfree(odp_data->page_list);
+out_odp_data:
+ kfree(odp_data);
+out_umem:
+ kfree(umem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_alloc_odp_umem);
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
+ int access)
+{
+ int ret_val;
+ struct pid *our_pid;
+ struct mm_struct *mm = get_task_mm(current);
+
+ if (!mm)
+ return -EINVAL;
+
+ if (access & IB_ACCESS_HUGETLB) {
+ struct vm_area_struct *vma;
+ struct hstate *h;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, ib_umem_start(umem));
+ if (!vma || !is_vm_hugetlb_page(vma)) {
+ up_read(&mm->mmap_sem);
+ ret_val = -EINVAL;
+ goto out_mm;
+ }
+ h = hstate_vma(vma);
+ umem->page_shift = huge_page_shift(h);
+ up_read(&mm->mmap_sem);
+ umem->hugetlb = 1;
+ } else {
+ umem->hugetlb = 0;
+ }
+
+ /* Prevent creating ODP MRs in child processes */
+ rcu_read_lock();
+ our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+ put_pid(our_pid);
+ if (context->tgid != our_pid) {
+ ret_val = -EINVAL;
+ goto out_mm;
+ }
+
+ umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
+ if (!umem->odp_data) {
+ ret_val = -ENOMEM;
+ goto out_mm;
+ }
+ umem->odp_data->umem = umem;
+
+ mutex_init(&umem->odp_data->umem_mutex);
+
+ init_completion(&umem->odp_data->notifier_completion);
+
+ if (ib_umem_num_pages(umem)) {
+ umem->odp_data->page_list =
+ vzalloc(array_size(sizeof(*umem->odp_data->page_list),
+ ib_umem_num_pages(umem)));
+ if (!umem->odp_data->page_list) {
+ ret_val = -ENOMEM;
+ goto out_odp_data;
+ }
+
+ umem->odp_data->dma_list =
+ vzalloc(array_size(sizeof(*umem->odp_data->dma_list),
+ ib_umem_num_pages(umem)));
+ if (!umem->odp_data->dma_list) {
+ ret_val = -ENOMEM;
+ goto out_page_list;
+ }
+ }
+
+ /*
+ * When using MMU notifiers, we will get a
+ * notification before the "current" task (and MM) is
+ * destroyed. We use the umem_rwsem semaphore to synchronize.
+ */
+ down_write(&context->umem_rwsem);
+ context->odp_mrs_count++;
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ if (likely(!atomic_read(&context->notifier_count)) ||
+ context->odp_mrs_count == 1)
+ umem->odp_data->mn_counters_active = true;
+ else
+ list_add(&umem->odp_data->no_private_counters,
+ &context->no_private_counters);
+ downgrade_write(&context->umem_rwsem);
+
+ if (context->odp_mrs_count == 1) {
+ /*
+ * Note that at this point, no MMU notifier is running
+ * for this context!
+ */
+ atomic_set(&context->notifier_count, 0);
+ INIT_HLIST_NODE(&context->mn.hlist);
+ context->mn.ops = &ib_umem_notifiers;
+ /*
+ * Lock-dep detects a false positive for mmap_sem vs.
+ * umem_rwsem, due to not grasping downgrade_write correctly.
+ */
+ lockdep_off();
+ ret_val = mmu_notifier_register(&context->mn, mm);
+ lockdep_on();
+ if (ret_val) {
+ pr_err("Failed to register mmu_notifier %d\n", ret_val);
+ ret_val = -EBUSY;
+ goto out_mutex;
+ }
+ }
+
+ up_read(&context->umem_rwsem);
+
+ /*
+ * Note that doing an mmput can cause a notifier for the relevant mm.
+ * If the notifier is called while we hold the umem_rwsem, this will
+ * cause a deadlock. Therefore, we release the reference only after we
+ * released the semaphore.
+ */
+ mmput(mm);
+ return 0;
+
+out_mutex:
+ up_read(&context->umem_rwsem);
+ vfree(umem->odp_data->dma_list);
+out_page_list:
+ vfree(umem->odp_data->page_list);
+out_odp_data:
+ kfree(umem->odp_data);
+out_mm:
+ mmput(mm);
+ return ret_val;
+}
+
+void ib_umem_odp_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+
+ /*
+ * Ensure that no more pages are mapped in the umem.
+ *
+ * It is the driver's responsibility to ensure, before calling us,
+ * that the hardware will not attempt to access the MR any more.
+ */
+ ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
+ ib_umem_end(umem));
+
+ down_write(&context->umem_rwsem);
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ context->odp_mrs_count--;
+ if (!umem->odp_data->mn_counters_active) {
+ list_del(&umem->odp_data->no_private_counters);
+ complete_all(&umem->odp_data->notifier_completion);
+ }
+
+ /*
+ * Downgrade the lock to a read lock. This ensures that the notifiers
+ * (who lock the mutex for reading) will be able to finish, and we
+ * will be able to enventually obtain the mmu notifiers SRCU. Note
+ * that since we are doing it atomically, no other user could register
+ * and unregister while we do the check.
+ */
+ downgrade_write(&context->umem_rwsem);
+ if (!context->odp_mrs_count) {
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+
+ owning_process = get_pid_task(context->tgid,
+ PIDTYPE_PID);
+ if (owning_process == NULL)
+ /*
+ * The process is already dead, notifier were removed
+ * already.
+ */
+ goto out;
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL)
+ /*
+ * The process' mm is already dead, notifier were
+ * removed already.
+ */
+ goto out_put_task;
+ mmu_notifier_unregister(&context->mn, owning_mm);
+
+ mmput(owning_mm);
+
+out_put_task:
+ put_task_struct(owning_process);
+ }
+out:
+ up_read(&context->umem_rwsem);
+
+ vfree(umem->odp_data->dma_list);
+ vfree(umem->odp_data->page_list);
+ kfree(umem->odp_data);
+ kfree(umem);
+}
+
+/*
+ * Map for DMA and insert a single page into the on-demand paging page tables.
+ *
+ * @umem: the umem to insert the page to.
+ * @page_index: index in the umem to add the page to.
+ * @page: the page struct to map and add.
+ * @access_mask: access permissions needed for this page.
+ * @current_seq: sequence number for synchronization with invalidations.
+ * the sequence number is taken from
+ * umem->odp_data->notifiers_seq.
+ *
+ * The function returns -EFAULT if the DMA mapping operation fails. It returns
+ * -EAGAIN if a concurrent invalidation prevents us from updating the page.
+ *
+ * The page is released via put_page even if the operation failed. For
+ * on-demand pinning, the page is released whenever it isn't stored in the
+ * umem.
+ */
+static int ib_umem_odp_map_dma_single_page(
+ struct ib_umem *umem,
+ int page_index,
+ struct page *page,
+ u64 access_mask,
+ unsigned long current_seq)
+{
+ struct ib_device *dev = umem->context->device;
+ dma_addr_t dma_addr;
+ int stored_page = 0;
+ int remove_existing_mapping = 0;
+ int ret = 0;
+
+ /*
+ * Note: we avoid writing if seq is different from the initial seq, to
+ * handle case of a racing notifier. This check also allows us to bail
+ * early if we have a notifier running in parallel with us.
+ */
+ if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (!(umem->odp_data->dma_list[page_index])) {
+ dma_addr = ib_dma_map_page(dev,
+ page,
+ 0, BIT(umem->page_shift),
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, dma_addr)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
+ umem->odp_data->page_list[page_index] = page;
+ umem->npages++;
+ stored_page = 1;
+ } else if (umem->odp_data->page_list[page_index] == page) {
+ umem->odp_data->dma_list[page_index] |= access_mask;
+ } else {
+ pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
+ umem->odp_data->page_list[page_index], page);
+ /* Better remove the mapping now, to prevent any further
+ * damage. */
+ remove_existing_mapping = 1;
+ }
+
+out:
+ /* On Demand Paging - avoid pinning the page */
+ if (umem->context->invalidate_range || !stored_page)
+ put_page(page);
+
+ if (remove_existing_mapping && umem->context->invalidate_range) {
+ invalidate_page_trampoline(
+ umem,
+ ib_umem_start(umem) + (page_index >> umem->page_shift),
+ ib_umem_start(umem) + ((page_index + 1) >>
+ umem->page_shift),
+ NULL);
+ ret = -EAGAIN;
+ }
+
+ return ret;
+}
+
+/**
+ * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
+ *
+ * Pins the range of pages passed in the argument, and maps them to
+ * DMA addresses. The DMA addresses of the mapped pages is updated in
+ * umem->odp_data->dma_list.
+ *
+ * Returns the number of pages mapped in success, negative error code
+ * for failure.
+ * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
+ * the function from completing its task.
+ * An -ENOENT error code indicates that userspace process is being terminated
+ * and mm was already destroyed.
+ * @umem: the umem to map and pin
+ * @user_virt: the address from which we need to map.
+ * @bcnt: the minimal number of bytes to pin and map. The mapping might be
+ * bigger due to alignment, and may also be smaller in case of an error
+ * pinning or mapping a page. The actual pages mapped is returned in
+ * the return value.
+ * @access_mask: bit mask of the requested access permissions for the given
+ * range.
+ * @current_seq: the MMU notifiers sequance value for synchronization with
+ * invalidations. the sequance number is read from
+ * umem->odp_data->notifiers_seq before calling this function
+ */
+int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
+ u64 access_mask, unsigned long current_seq)
+{
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+ struct page **local_page_list = NULL;
+ u64 page_mask, off;
+ int j, k, ret = 0, start_idx, npages = 0, page_shift;
+ unsigned int flags = 0;
+ phys_addr_t p = 0;
+
+ if (access_mask == 0)
+ return -EINVAL;
+
+ if (user_virt < ib_umem_start(umem) ||
+ user_virt + bcnt > ib_umem_end(umem))
+ return -EFAULT;
+
+ local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
+ if (!local_page_list)
+ return -ENOMEM;
+
+ page_shift = umem->page_shift;
+ page_mask = ~(BIT(page_shift) - 1);
+ off = user_virt & (~page_mask);
+ user_virt = user_virt & page_mask;
+ bcnt += off; /* Charge for the first page offset as well. */
+
+ owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
+ if (owning_process == NULL) {
+ ret = -EINVAL;
+ goto out_no_task;
+ }
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL) {
+ ret = -ENOENT;
+ goto out_put_task;
+ }
+
+ if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ flags |= FOLL_WRITE;
+
+ start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
+ k = start_idx;
+
+ while (bcnt > 0) {
+ const size_t gup_num_pages = min_t(size_t,
+ ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
+ PAGE_SIZE / sizeof(struct page *));
+
+ down_read(&owning_mm->mmap_sem);
+ /*
+ * Note: this might result in redundent page getting. We can
+ * avoid this by checking dma_list to be 0 before calling
+ * get_user_pages. However, this make the code much more
+ * complex (and doesn't gain us much performance in most use
+ * cases).
+ */
+ npages = get_user_pages_remote(owning_process, owning_mm,
+ user_virt, gup_num_pages,
+ flags, local_page_list, NULL, NULL);
+ up_read(&owning_mm->mmap_sem);
+
+ if (npages < 0)
+ break;
+
+ bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
+ if (user_virt & ~page_mask) {
+ p += PAGE_SIZE;
+ if (page_to_phys(local_page_list[j]) != p) {
+ ret = -EFAULT;
+ break;
+ }
+ put_page(local_page_list[j]);
+ continue;
+ }
+
+ ret = ib_umem_odp_map_dma_single_page(
+ umem, k, local_page_list[j],
+ access_mask, current_seq);
+ if (ret < 0)
+ break;
+
+ p = page_to_phys(local_page_list[j]);
+ k++;
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+
+ if (ret < 0) {
+ /* Release left over pages when handling errors. */
+ for (++j; j < npages; ++j)
+ put_page(local_page_list[j]);
+ break;
+ }
+ }
+
+ if (ret >= 0) {
+ if (npages < 0 && k == start_idx)
+ ret = npages;
+ else
+ ret = k - start_idx;
+ }
+
+ mmput(owning_mm);
+out_put_task:
+ put_task_struct(owning_process);
+out_no_task:
+ free_page((unsigned long)local_page_list);
+ return ret;
+}
+EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
+ u64 bound)
+{
+ int idx;
+ u64 addr;
+ struct ib_device *dev = umem->context->device;
+
+ virt = max_t(u64, virt, ib_umem_start(umem));
+ bound = min_t(u64, bound, ib_umem_end(umem));
+ /* Note that during the run of this function, the
+ * notifiers_count of the MR is > 0, preventing any racing
+ * faults from completion. We might be racing with other
+ * invalidations, so we must make sure we free each page only
+ * once. */
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
+ idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+ if (umem->odp_data->page_list[idx]) {
+ struct page *page = umem->odp_data->page_list[idx];
+ dma_addr_t dma = umem->odp_data->dma_list[idx];
+ dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
+
+ WARN_ON(!dma_addr);
+
+ ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ if (dma & ODP_WRITE_ALLOWED_BIT) {
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
+ }
+ /* on demand pinning support */
+ if (!umem->context->invalidate_range)
+ put_page(page);
+ umem->odp_data->page_list[idx] = NULL;
+ umem->odp_data->dma_list[idx] = 0;
+ umem->npages--;
+ }
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+}
+EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
+
+/* @last is not a part of the interval. See comment for function
+ * node_last.
+ */
+int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
+ u64 start, u64 last,
+ umem_call_back cb,
+ bool blockable,
+ void *cookie)
+{
+ int ret_val = 0;
+ struct umem_odp_node *node, *next;
+ struct ib_umem_odp *umem;
+
+ if (unlikely(start == last))
+ return ret_val;
+
+ for (node = rbt_ib_umem_iter_first(root, start, last - 1);
+ node; node = next) {
+ /* TODO move the blockable decision up to the callback */
+ if (!blockable)
+ return -EAGAIN;
+ next = rbt_ib_umem_iter_next(node, start, last - 1);
+ umem = container_of(node, struct ib_umem_odp, interval_tree);
+ ret_val = cb(umem->umem, start, last, cookie) || ret_val;
+ }
+
+ return ret_val;
+}
+EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
+
+struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
+ u64 addr, u64 length)
+{
+ struct umem_odp_node *node;
+
+ node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
+ if (node)
+ return container_of(node, struct ib_umem_odp, interval_tree);
+ return NULL;
+
+}
+EXPORT_SYMBOL(rbt_ib_umem_lookup);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
new file mode 100644
index 000000000..471a824be
--- /dev/null
+++ b/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,1425 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "user_mad: " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/nospec.h>
+
+#include <linux/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS,
+ IB_UMAD_MAX_AGENTS = 32,
+
+ IB_UMAD_MAJOR = 231,
+ IB_UMAD_MINOR_BASE = 0,
+ IB_UMAD_NUM_FIXED_MINOR = 64,
+ IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR,
+ IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR,
+};
+
+/*
+ * Our lifetime rules for these structs are the following:
+ * device special file is opened, we take a reference on the
+ * ib_umad_port's struct ib_umad_device. We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we drop the module's reference.
+ */
+
+struct ib_umad_port {
+ struct cdev cdev;
+ struct device *dev;
+
+ struct cdev sm_cdev;
+ struct device *sm_dev;
+ struct semaphore sm_sem;
+
+ struct mutex file_mutex;
+ struct list_head file_list;
+
+ struct ib_device *ib_dev;
+ struct ib_umad_device *umad_dev;
+ int dev_num;
+ u8 port_num;
+};
+
+struct ib_umad_device {
+ struct kobject kobj;
+ struct ib_umad_port port[0];
+};
+
+struct ib_umad_file {
+ struct mutex mutex;
+ struct ib_umad_port *port;
+ struct list_head recv_list;
+ struct list_head send_list;
+ struct list_head port_list;
+ spinlock_t send_lock;
+ wait_queue_head_t recv_wait;
+ struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ int agents_dead;
+ u8 use_pkey_index;
+ u8 already_used;
+};
+
+struct ib_umad_packet {
+ struct ib_mad_send_buf *msg;
+ struct ib_mad_recv_wc *recv_wc;
+ struct list_head list;
+ int length;
+ struct ib_user_mad mad;
+};
+
+static struct class *umad_class;
+
+static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
+ IB_UMAD_NUM_FIXED_MINOR;
+static dev_t dynamic_umad_dev;
+static dev_t dynamic_issm_dev;
+
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
+
+static void ib_umad_release_dev(struct kobject *kobj)
+{
+ struct ib_umad_device *dev =
+ container_of(kobj, struct ib_umad_device, kobj);
+
+ kfree(dev);
+}
+
+static struct kobj_type ib_umad_dev_ktype = {
+ .release = ib_umad_release_dev,
+};
+
+static int hdr_size(struct ib_umad_file *file)
+{
+ return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) :
+ sizeof (struct ib_user_mad_hdr_old);
+}
+
+/* caller must hold file->mutex */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+ return file->agents_dead ? NULL : file->agent[id];
+}
+
+static int queue_packet(struct ib_umad_file *file,
+ struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet)
+{
+ int ret = 1;
+
+ mutex_lock(&file->mutex);
+
+ for (packet->mad.hdr.id = 0;
+ packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+ packet->mad.hdr.id++)
+ if (agent == __get_agent(file, packet->mad.hdr.id)) {
+ list_add_tail(&packet->list, &file->recv_list);
+ wake_up_interruptible(&file->recv_wait);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static void dequeue_send(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ spin_lock_irq(&file->send_lock);
+ list_del(&packet->list);
+ spin_unlock_irq(&file->send_lock);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *send_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+
+ dequeue_send(file, packet);
+ rdma_destroy_ah(packet->msg->ah);
+ ib_free_send_mad(packet->msg);
+
+ if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+ packet->length = IB_MGMT_MAD_HDR;
+ packet->mad.hdr.status = ETIMEDOUT;
+ if (!queue_packet(file, agent, packet))
+ return;
+ }
+ kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet;
+
+ if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+ goto err1;
+
+ packet = kzalloc(sizeof *packet, GFP_KERNEL);
+ if (!packet)
+ goto err1;
+
+ packet->length = mad_recv_wc->mad_len;
+ packet->recv_wc = mad_recv_wc;
+
+ packet->mad.hdr.status = 0;
+ packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len;
+ packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp);
+ /*
+ * On OPA devices it is okay to lose the upper 16 bits of LID as this
+ * information is obtained elsewhere. Mask off the upper 16 bits.
+ */
+ if (rdma_cap_opa_mad(agent->device, agent->port_num))
+ packet->mad.hdr.lid = ib_lid_be16(0xFFFF &
+ mad_recv_wc->wc->slid);
+ else
+ packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid);
+ packet->mad.hdr.sl = mad_recv_wc->wc->sl;
+ packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits;
+ packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+ packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+ if (packet->mad.hdr.grh_present) {
+ struct rdma_ah_attr ah_attr;
+ const struct ib_global_route *grh;
+ int ret;
+
+ ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num,
+ mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh,
+ &ah_attr);
+ if (ret)
+ goto err2;
+
+ grh = rdma_ah_read_grh(&ah_attr);
+ packet->mad.hdr.gid_index = grh->sgid_index;
+ packet->mad.hdr.hop_limit = grh->hop_limit;
+ packet->mad.hdr.traffic_class = grh->traffic_class;
+ memcpy(packet->mad.hdr.gid, &grh->dgid, 16);
+ packet->mad.hdr.flow_label = cpu_to_be32(grh->flow_label);
+ rdma_destroy_ah_attr(&ah_attr);
+ }
+
+ if (queue_packet(file, agent, packet))
+ goto err2;
+ return;
+
+err2:
+ kfree(packet);
+err1:
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ struct ib_mad_recv_buf *recv_buf;
+ int left, seg_payload, offset, max_seg_payload;
+ size_t seg_size;
+
+ recv_buf = &packet->recv_wc->recv_buf;
+ seg_size = packet->recv_wc->mad_seg_size;
+
+ /* We need enough room to copy the first (or only) MAD segment. */
+ if ((packet->length <= seg_size &&
+ count < hdr_size(file) + packet->length) ||
+ (packet->length > seg_size &&
+ count < hdr_size(file) + seg_size))
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+ seg_payload = min_t(int, packet->length, seg_size);
+ if (copy_to_user(buf, recv_buf->mad, seg_payload))
+ return -EFAULT;
+
+ if (seg_payload < packet->length) {
+ /*
+ * Multipacket RMPP MAD message. Copy remainder of message.
+ * Note that last segment may have a shorter payload.
+ */
+ if (count < hdr_size(file) + packet->length) {
+ /*
+ * The buffer is too small, return the first RMPP segment,
+ * which includes the RMPP message length.
+ */
+ return -ENOSPC;
+ }
+ offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+ max_seg_payload = seg_size - offset;
+
+ for (left = packet->length - seg_payload, buf += seg_payload;
+ left; left -= seg_payload, buf += seg_payload) {
+ recv_buf = container_of(recv_buf->list.next,
+ struct ib_mad_recv_buf, list);
+ seg_payload = min(left, max_seg_payload);
+ if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+ seg_payload))
+ return -EFAULT;
+ }
+ }
+ return hdr_size(file) + packet->length;
+}
+
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ ssize_t size = hdr_size(file) + packet->length;
+
+ if (count < size)
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+
+ if (copy_to_user(buf, packet->mad.data, packet->length))
+ return -EFAULT;
+
+ return size;
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ ssize_t ret;
+
+ if (count < hdr_size(file))
+ return -EINVAL;
+
+ mutex_lock(&file->mutex);
+
+ if (file->agents_dead) {
+ mutex_unlock(&file->mutex);
+ return -EIO;
+ }
+
+ while (list_empty(&file->recv_list)) {
+ mutex_unlock(&file->mutex);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->recv_wait,
+ !list_empty(&file->recv_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mutex);
+ }
+
+ if (file->agents_dead) {
+ mutex_unlock(&file->mutex);
+ return -EIO;
+ }
+
+ packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+ list_del(&packet->list);
+
+ mutex_unlock(&file->mutex);
+
+ if (packet->recv_wc)
+ ret = copy_recv_mad(file, buf, packet, count);
+ else
+ ret = copy_send_mad(file, buf, packet, count);
+
+ if (ret < 0) {
+ /* Requeue packet */
+ mutex_lock(&file->mutex);
+ list_add(&packet->list, &file->recv_list);
+ mutex_unlock(&file->mutex);
+ } else {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+ return ret;
+}
+
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+ int left, seg;
+
+ /* Copy class specific header */
+ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+ copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+ msg->hdr_len - IB_MGMT_RMPP_HDR))
+ return -EFAULT;
+
+ /* All headers are in place. Copy data segments. */
+ for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+ seg++, left -= msg->seg_size, buf += msg->seg_size) {
+ if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+ min(left, msg->seg_size)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+ struct ib_user_mad_hdr *hdr2)
+{
+ if (!hdr1->grh_present && !hdr2->grh_present)
+ return (hdr1->lid == hdr2->lid);
+
+ if (hdr1->grh_present && hdr2->grh_present)
+ return !memcmp(hdr1->gid, hdr2->gid, 16);
+
+ return 0;
+}
+
+static int is_duplicate(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ struct ib_umad_packet *sent_packet;
+ struct ib_mad_hdr *sent_hdr, *hdr;
+
+ hdr = (struct ib_mad_hdr *) packet->mad.data;
+ list_for_each_entry(sent_packet, &file->send_list, list) {
+ sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+ if ((hdr->tid != sent_hdr->tid) ||
+ (hdr->mgmt_class != sent_hdr->mgmt_class))
+ continue;
+
+ /*
+ * No need to be overly clever here. If two new operations have
+ * the same TID, reject the second as a duplicate. This is more
+ * restrictive than required by the spec.
+ */
+ if (!ib_response_mad(hdr)) {
+ if (!ib_response_mad(sent_hdr))
+ return 1;
+ continue;
+ } else if (!ib_response_mad(sent_hdr))
+ continue;
+
+ if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+ return 1;
+ }
+
+ return 0;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ struct ib_mad_agent *agent;
+ struct rdma_ah_attr ah_attr;
+ struct ib_ah *ah;
+ struct ib_rmpp_mad *rmpp_mad;
+ __be64 *tid;
+ int ret, data_len, hdr_len, copy_offset, rmpp_active;
+ u8 base_version;
+
+ if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+ return -EINVAL;
+
+ packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+ if (!packet)
+ return -ENOMEM;
+
+ if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ buf += hdr_size(file);
+
+ if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ mutex_lock(&file->mutex);
+
+ agent = __get_agent(file, packet->mad.hdr.id);
+ if (!agent) {
+ ret = -EIO;
+ goto err_up;
+ }
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.type = rdma_ah_find_type(agent->device,
+ file->port->port_num);
+ rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid));
+ rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl);
+ rdma_ah_set_path_bits(&ah_attr, packet->mad.hdr.path_bits);
+ rdma_ah_set_port_num(&ah_attr, file->port->port_num);
+ if (packet->mad.hdr.grh_present) {
+ rdma_ah_set_grh(&ah_attr, NULL,
+ be32_to_cpu(packet->mad.hdr.flow_label),
+ packet->mad.hdr.gid_index,
+ packet->mad.hdr.hop_limit,
+ packet->mad.hdr.traffic_class);
+ rdma_ah_set_dgid_raw(&ah_attr, packet->mad.hdr.gid);
+ }
+
+ ah = rdma_create_user_ah(agent->qp->pd, &ah_attr, NULL);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_up;
+ }
+
+ rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+ hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+
+ if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && ib_mad_kernel_rmpp_agent(agent)) {
+ copy_offset = IB_MGMT_RMPP_HDR;
+ rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE;
+ } else {
+ copy_offset = IB_MGMT_MAD_HDR;
+ rmpp_active = 0;
+ }
+
+ base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version;
+ data_len = count - hdr_size(file) - hdr_len;
+ packet->msg = ib_create_send_mad(agent,
+ be32_to_cpu(packet->mad.hdr.qpn),
+ packet->mad.hdr.pkey_index, rmpp_active,
+ hdr_len, data_len, GFP_KERNEL,
+ base_version);
+ if (IS_ERR(packet->msg)) {
+ ret = PTR_ERR(packet->msg);
+ goto err_ah;
+ }
+
+ packet->msg->ah = ah;
+ packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+ packet->msg->retries = packet->mad.hdr.retries;
+ packet->msg->context[0] = packet;
+
+ /* Copy MAD header. Any RMPP header is already in place. */
+ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+ if (!rmpp_active) {
+ if (copy_from_user(packet->msg->mad + copy_offset,
+ buf + copy_offset,
+ hdr_len + data_len - copy_offset)) {
+ ret = -EFAULT;
+ goto err_msg;
+ }
+ } else {
+ ret = copy_rmpp_mad(packet->msg, buf);
+ if (ret)
+ goto err_msg;
+ }
+
+ /*
+ * Set the high-order part of the transaction ID to make MADs from
+ * different agents unique, and allow routing responses back to the
+ * original requestor.
+ */
+ if (!ib_response_mad(packet->msg->mad)) {
+ tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+ *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+ (be64_to_cpup(tid) & 0xffffffff));
+ rmpp_mad->mad_hdr.tid = *tid;
+ }
+
+ if (!ib_mad_kernel_rmpp_agent(agent)
+ && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ spin_lock_irq(&file->send_lock);
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ } else {
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
+ }
+
+ ret = ib_post_send_mad(packet->msg, NULL);
+ if (ret)
+ goto err_send;
+
+ mutex_unlock(&file->mutex);
+ return count;
+
+err_send:
+ dequeue_send(file, packet);
+err_msg:
+ ib_free_send_mad(packet->msg);
+err_ah:
+ rdma_destroy_ah(ah);
+err_up:
+ mutex_unlock(&file->mutex);
+err:
+ kfree(packet);
+ return ret;
+}
+
+static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ib_umad_file *file = filp->private_data;
+
+ /* we will always be able to post a MAD send */
+ __poll_t mask = EPOLLOUT | EPOLLWRNORM;
+
+ mutex_lock(&file->mutex);
+ poll_wait(filp, &file->recv_wait, wait);
+
+ if (!list_empty(&file->recv_list))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ if (file->agents_dead)
+ mask = EPOLLERR;
+ mutex_unlock(&file->mutex);
+
+ return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+ int compat_method_mask)
+{
+ struct ib_user_mad_reg_req ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid device\n");
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof ureq)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid QPN %d specified\n",
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+ if (compat_method_mask) {
+ u32 *umm = (u32 *) ureq.method_mask;
+ int i;
+
+ for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+ req.method_mask[i] =
+ umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+ } else
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof req.method_mask);
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ if (!file->use_pkey_index) {
+ dev_warn(file->port->dev,
+ "process %s did not enable P_Key index support.\n",
+ current->comm);
+ dev_warn(file->port->dev,
+ " Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
+ }
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
+{
+ struct ib_user_mad_reg_req2 ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid device\n");
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof(ureq))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid QPN %d specified\n",
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+ ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+ ret = -EINVAL;
+
+ if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP,
+ (u32 __user *) (arg + offsetof(struct
+ ib_user_mad_reg_req2, flags))))
+ ret = -EFAULT;
+
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ if (ureq.oui & 0xff000000) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
+ ureq.oui);
+ ret = -EINVAL;
+ goto out;
+ }
+ req.oui[2] = ureq.oui & 0x0000ff;
+ req.oui[1] = (ureq.oui & 0x00ff00) >> 8;
+ req.oui[0] = (ureq.oui & 0xff0000) >> 16;
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof(req.method_mask));
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file,
+ ureq.flags);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *)(arg +
+ offsetof(struct ib_user_mad_reg_req2, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ file->use_pkey_index = 1;
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+ struct ib_mad_agent *agent = NULL;
+ u32 id;
+ int ret = 0;
+
+ if (get_user(id, arg))
+ return -EFAULT;
+ if (id >= IB_UMAD_MAX_AGENTS)
+ return -EINVAL;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ id = array_index_nospec(id, IB_UMAD_MAX_AGENTS);
+ if (!__get_agent(file, id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ agent = file->agent[id];
+ file->agent[id] = NULL;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&file->mutex);
+ if (file->already_used)
+ ret = -EINVAL;
+ else
+ file->use_pkey_index = 1;
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, (void __user *) arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg));
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ * - the ib_umad_port structures are properly reference counted, and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - the ioctl method does not affect any global state outside of the
+ * file structure being operated on;
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_umad_file *file;
+ int ret = -ENXIO;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
+
+ mutex_lock(&port->file_mutex);
+
+ if (!port->ib_dev)
+ goto out;
+
+ ret = -ENOMEM;
+ file = kzalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ goto out;
+
+ mutex_init(&file->mutex);
+ spin_lock_init(&file->send_lock);
+ INIT_LIST_HEAD(&file->recv_list);
+ INIT_LIST_HEAD(&file->send_list);
+ init_waitqueue_head(&file->recv_wait);
+
+ file->port = port;
+ filp->private_data = file;
+
+ list_add_tail(&file->port_list, &port->file_list);
+
+ ret = nonseekable_open(inode, filp);
+ if (ret) {
+ list_del(&file->port_list);
+ kfree(file);
+ goto out;
+ }
+
+ kobject_get(&port->umad_dev->kobj);
+
+out:
+ mutex_unlock(&port->file_mutex);
+ return ret;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_device *dev = file->port->umad_dev;
+ struct ib_umad_packet *packet, *tmp;
+ int already_dead;
+ int i;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ already_dead = file->agents_dead;
+ file->agents_dead = 1;
+
+ list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+
+ list_del(&file->port_list);
+
+ mutex_unlock(&file->mutex);
+
+ if (!already_dead)
+ for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+ if (file->agent[i])
+ ib_unregister_mad_agent(file->agent[i]);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ kfree(file);
+ kobject_put(&dev->kobj);
+
+ return 0;
+}
+
+static const struct file_operations umad_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_umad_read,
+ .write = ib_umad_write,
+ .poll = ib_umad_poll,
+ .unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ib_umad_compat_ioctl,
+#endif
+ .open = ib_umad_open,
+ .release = ib_umad_close,
+ .llseek = no_llseek,
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_port_modify props = {
+ .set_port_cap_mask = IB_PORT_SM
+ };
+ int ret;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev);
+
+ if (filp->f_flags & O_NONBLOCK) {
+ if (down_trylock(&port->sm_sem)) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+ } else {
+ if (down_interruptible(&port->sm_sem)) {
+ ret = -ERESTARTSYS;
+ goto fail;
+ }
+ }
+
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ if (ret)
+ goto err_up_sem;
+
+ filp->private_data = port;
+
+ ret = nonseekable_open(inode, filp);
+ if (ret)
+ goto err_clr_sm_cap;
+
+ kobject_get(&port->umad_dev->kobj);
+
+ return 0;
+
+err_clr_sm_cap:
+ swap(props.set_port_cap_mask, props.clr_port_cap_mask);
+ ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+
+err_up_sem:
+ up(&port->sm_sem);
+
+fail:
+ return ret;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port = filp->private_data;
+ struct ib_port_modify props = {
+ .clr_port_cap_mask = IB_PORT_SM
+ };
+ int ret = 0;
+
+ mutex_lock(&port->file_mutex);
+ if (port->ib_dev)
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ mutex_unlock(&port->file_mutex);
+
+ up(&port->sm_sem);
+
+ kobject_put(&port->umad_dev->kobj);
+
+ return ret;
+}
+
+static const struct file_operations umad_sm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_umad_sm_open,
+ .release = ib_umad_sm_close,
+ .llseek = no_llseek,
+};
+
+static struct ib_client umad_client = {
+ .name = "umad",
+ .add = ib_umad_add_one,
+ .remove = ib_umad_remove_one
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sprintf(buf, "%d\n", port->port_num);
+}
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_MAD_ABI_VERSION));
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+ struct ib_umad_device *umad_dev,
+ struct ib_umad_port *port)
+{
+ int devnum;
+ dev_t base_umad;
+ dev_t base_issm;
+
+ devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+ if (devnum >= IB_UMAD_MAX_PORTS)
+ return -1;
+ port->dev_num = devnum;
+ set_bit(devnum, dev_map);
+ if (devnum >= IB_UMAD_NUM_FIXED_MINOR) {
+ base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
+ base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
+ } else {
+ base_umad = devnum + base_umad_dev;
+ base_issm = devnum + base_issm_dev;
+ }
+
+ port->ib_dev = device;
+ port->port_num = port_num;
+ sema_init(&port->sm_sem, 1);
+ mutex_init(&port->file_mutex);
+ INIT_LIST_HEAD(&port->file_list);
+
+ cdev_init(&port->cdev, &umad_fops);
+ port->cdev.owner = THIS_MODULE;
+ cdev_set_parent(&port->cdev, &umad_dev->kobj);
+ kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
+ if (cdev_add(&port->cdev, base_umad, 1))
+ goto err_cdev;
+
+ port->dev = device_create(umad_class, device->dev.parent,
+ port->cdev.dev, port,
+ "umad%d", port->dev_num);
+ if (IS_ERR(port->dev))
+ goto err_cdev;
+
+ if (device_create_file(port->dev, &dev_attr_ibdev))
+ goto err_dev;
+ if (device_create_file(port->dev, &dev_attr_port))
+ goto err_dev;
+
+ cdev_init(&port->sm_cdev, &umad_sm_fops);
+ port->sm_cdev.owner = THIS_MODULE;
+ cdev_set_parent(&port->sm_cdev, &umad_dev->kobj);
+ kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
+ if (cdev_add(&port->sm_cdev, base_issm, 1))
+ goto err_sm_cdev;
+
+ port->sm_dev = device_create(umad_class, device->dev.parent,
+ port->sm_cdev.dev, port,
+ "issm%d", port->dev_num);
+ if (IS_ERR(port->sm_dev))
+ goto err_sm_cdev;
+
+ if (device_create_file(port->sm_dev, &dev_attr_ibdev))
+ goto err_sm_dev;
+ if (device_create_file(port->sm_dev, &dev_attr_port))
+ goto err_sm_dev;
+
+ return 0;
+
+err_sm_dev:
+ device_destroy(umad_class, port->sm_cdev.dev);
+
+err_sm_cdev:
+ cdev_del(&port->sm_cdev);
+
+err_dev:
+ device_destroy(umad_class, port->cdev.dev);
+
+err_cdev:
+ cdev_del(&port->cdev);
+ clear_bit(devnum, dev_map);
+
+ return -1;
+}
+
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+ struct ib_umad_file *file;
+ int id;
+
+ dev_set_drvdata(port->dev, NULL);
+ dev_set_drvdata(port->sm_dev, NULL);
+
+ device_destroy(umad_class, port->cdev.dev);
+ device_destroy(umad_class, port->sm_cdev.dev);
+
+ cdev_del(&port->cdev);
+ cdev_del(&port->sm_cdev);
+
+ mutex_lock(&port->file_mutex);
+
+ port->ib_dev = NULL;
+
+ list_for_each_entry(file, &port->file_list, port_list) {
+ mutex_lock(&file->mutex);
+ file->agents_dead = 1;
+ wake_up_interruptible(&file->recv_wait);
+ mutex_unlock(&file->mutex);
+
+ for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+ if (file->agent[id])
+ ib_unregister_mad_agent(file->agent[id]);
+ }
+
+ mutex_unlock(&port->file_mutex);
+ clear_bit(port->dev_num, dev_map);
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev;
+ int s, e, i;
+ int count = 0;
+
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
+
+ umad_dev = kzalloc(sizeof *umad_dev +
+ (e - s + 1) * sizeof (struct ib_umad_port),
+ GFP_KERNEL);
+ if (!umad_dev)
+ return;
+
+ kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
+
+ for (i = s; i <= e; ++i) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ umad_dev->port[i - s].umad_dev = umad_dev;
+
+ if (ib_umad_init_port(device, i, umad_dev,
+ &umad_dev->port[i - s]))
+ goto err;
+
+ count++;
+ }
+
+ if (!count)
+ goto free;
+
+ ib_set_client_data(device, &umad_client, umad_dev);
+
+ return;
+
+err:
+ while (--i >= s) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ ib_umad_kill_port(&umad_dev->port[i - s]);
+ }
+free:
+ kobject_put(&umad_dev->kobj);
+}
+
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_umad_device *umad_dev = client_data;
+ int i;
+
+ if (!umad_dev)
+ return;
+
+ for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) {
+ if (rdma_cap_ib_mad(device, i + rdma_start_port(device)))
+ ib_umad_kill_port(&umad_dev->port[i]);
+ }
+
+ kobject_put(&umad_dev->kobj);
+}
+
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_umad_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(base_umad_dev,
+ IB_UMAD_NUM_FIXED_MINOR * 2,
+ "infiniband_mad");
+ if (ret) {
+ pr_err("couldn't register device number\n");
+ goto out;
+ }
+
+ ret = alloc_chrdev_region(&dynamic_umad_dev, 0,
+ IB_UMAD_NUM_DYNAMIC_MINOR * 2,
+ "infiniband_mad");
+ if (ret) {
+ pr_err("couldn't register dynamic device number\n");
+ goto out_alloc;
+ }
+ dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR;
+
+ umad_class = class_create(THIS_MODULE, "infiniband_mad");
+ if (IS_ERR(umad_class)) {
+ ret = PTR_ERR(umad_class);
+ pr_err("couldn't create class infiniband_mad\n");
+ goto out_chrdev;
+ }
+
+ umad_class->devnode = umad_devnode;
+
+ ret = class_create_file(umad_class, &class_attr_abi_version.attr);
+ if (ret) {
+ pr_err("couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&umad_client);
+ if (ret) {
+ pr_err("couldn't register ib_umad client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_destroy(umad_class);
+
+out_chrdev:
+ unregister_chrdev_region(dynamic_umad_dev,
+ IB_UMAD_NUM_DYNAMIC_MINOR * 2);
+
+out_alloc:
+ unregister_chrdev_region(base_umad_dev,
+ IB_UMAD_NUM_FIXED_MINOR * 2);
+
+out:
+ return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+ ib_unregister_client(&umad_client);
+ class_destroy(umad_class);
+ unregister_chrdev_region(base_umad_dev,
+ IB_UMAD_NUM_FIXED_MINOR * 2);
+ unregister_chrdev_region(dynamic_umad_dev,
+ IB_UMAD_NUM_DYNAMIC_MINOR * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
new file mode 100644
index 000000000..4a14de2d8
--- /dev/null
+++ b/drivers/infiniband/core/uverbs.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/cdev.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_std_types.h>
+
+#define UVERBS_MODULE_NAME ib_uverbs
+#include <rdma/uverbs_named_ioctl.h>
+
+static inline void
+ib_uverbs_init_udata(struct ib_udata *udata,
+ const void __user *ibuf,
+ void __user *obuf,
+ size_t ilen, size_t olen)
+{
+ udata->inbuf = ibuf;
+ udata->outbuf = obuf;
+ udata->inlen = ilen;
+ udata->outlen = olen;
+}
+
+static inline void
+ib_uverbs_init_udata_buf_or_null(struct ib_udata *udata,
+ const void __user *ibuf,
+ void __user *obuf,
+ size_t ilen, size_t olen)
+{
+ ib_uverbs_init_udata(udata,
+ ilen ? ibuf : NULL, olen ? obuf : NULL,
+ ilen, olen);
+}
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one(). Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed. Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_queue: Base structure for
+ * struct ib_uverbs_async_event_file and struct ib_uverbs_completion_event_file.
+ * One reference is held by the VFS and released when the file is closed.
+ * For asynchronous event files, another reference is held by the corresponding
+ * main context file and released when that file is closed. For completion
+ * event files, a reference is taken when a CQ is created that uses the file,
+ * and released when the CQ is destroyed.
+ */
+
+struct ib_uverbs_device {
+ atomic_t refcount;
+ u32 num_comp_vectors;
+ struct completion comp;
+ struct device *dev;
+ struct ib_device __rcu *ib_dev;
+ int devnum;
+ struct cdev cdev;
+ struct rb_root xrcd_tree;
+ struct mutex xrcd_tree_mutex;
+ struct kobject kobj;
+ struct srcu_struct disassociate_srcu;
+ struct mutex lists_mutex; /* protect lists */
+ struct list_head uverbs_file_list;
+ struct list_head uverbs_events_file_list;
+ struct uverbs_api *uapi;
+};
+
+struct ib_uverbs_event_queue {
+ spinlock_t lock;
+ int is_closed;
+ wait_queue_head_t poll_wait;
+ struct fasync_struct *async_queue;
+ struct list_head event_list;
+};
+
+struct ib_uverbs_async_event_file {
+ struct ib_uverbs_event_queue ev_queue;
+ struct ib_uverbs_file *uverbs_file;
+ struct kref ref;
+ struct list_head list;
+};
+
+struct ib_uverbs_completion_event_file {
+ struct ib_uobject uobj;
+ struct ib_uverbs_event_queue ev_queue;
+};
+
+struct ib_uverbs_file {
+ struct kref ref;
+ struct ib_uverbs_device *device;
+ struct mutex ucontext_lock;
+ /*
+ * ucontext must be accessed via ib_uverbs_get_ucontext() or with
+ * ucontext_lock held
+ */
+ struct ib_ucontext *ucontext;
+ struct ib_event_handler event_handler;
+ struct ib_uverbs_async_event_file *async_file;
+ struct list_head list;
+ int is_closed;
+
+ /*
+ * To access the uobjects list hw_destroy_rwsem must be held for write
+ * OR hw_destroy_rwsem held for read AND uobjects_lock held.
+ * hw_destroy_rwsem should be called across any destruction of the HW
+ * object of an associated uobject.
+ */
+ struct rw_semaphore hw_destroy_rwsem;
+ spinlock_t uobjects_lock;
+ struct list_head uobjects;
+
+ u64 uverbs_cmd_mask;
+ u64 uverbs_ex_cmd_mask;
+
+ struct idr idr;
+ /* spinlock protects write access to idr */
+ spinlock_t idr_lock;
+};
+
+struct ib_uverbs_event {
+ union {
+ struct ib_uverbs_async_event_desc async;
+ struct ib_uverbs_comp_event_desc comp;
+ } desc;
+ struct list_head list;
+ struct list_head obj_list;
+ u32 *counter;
+};
+
+struct ib_uverbs_mcast_entry {
+ struct list_head list;
+ union ib_gid gid;
+ u16 lid;
+};
+
+struct ib_uevent_object {
+ struct ib_uobject uobject;
+ struct list_head event_list;
+ u32 events_reported;
+};
+
+struct ib_uxrcd_object {
+ struct ib_uobject uobject;
+ atomic_t refcnt;
+};
+
+struct ib_usrq_object {
+ struct ib_uevent_object uevent;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_uqp_object {
+ struct ib_uevent_object uevent;
+ /* lock for mcast list */
+ struct mutex mcast_lock;
+ struct list_head mcast_list;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_uwq_object {
+ struct ib_uevent_object uevent;
+};
+
+struct ib_ucq_object {
+ struct ib_uobject uobject;
+ struct list_head comp_list;
+ struct list_head async_list;
+ u32 comp_events_reported;
+ u32 async_events_reported;
+};
+
+struct ib_uflow_resources;
+struct ib_uflow_object {
+ struct ib_uobject uobject;
+ struct ib_uflow_resources *resources;
+};
+
+extern const struct file_operations uverbs_event_fops;
+void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue);
+struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
+void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+ struct ib_uverbs_completion_event_file *ev_file,
+ struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+ struct ib_uevent_object *uobj);
+void ib_uverbs_release_file(struct kref *ref);
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event);
+int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
+ enum rdma_remove_reason why);
+
+int uverbs_dealloc_mw(struct ib_mw *mw);
+void ib_uverbs_detach_umcast(struct ib_qp *qp,
+ struct ib_uqp_object *uobj);
+
+void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata);
+long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+
+struct ib_uverbs_flow_spec {
+ union {
+ union {
+ struct ib_uverbs_flow_spec_hdr hdr;
+ struct {
+ __u32 type;
+ __u16 size;
+ __u16 reserved;
+ };
+ };
+ struct ib_uverbs_flow_spec_eth eth;
+ struct ib_uverbs_flow_spec_ipv4 ipv4;
+ struct ib_uverbs_flow_spec_esp esp;
+ struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+ struct ib_uverbs_flow_spec_ipv6 ipv6;
+ struct ib_uverbs_flow_spec_action_tag flow_tag;
+ struct ib_uverbs_flow_spec_action_drop drop;
+ struct ib_uverbs_flow_spec_action_handle action;
+ struct ib_uverbs_flow_spec_action_count flow_count;
+ };
+};
+
+int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
+ const void *kern_spec_mask,
+ const void *kern_spec_val,
+ size_t kern_filter_sz,
+ union ib_flow_spec *ib_spec);
+
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DEVICE);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_PD);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MR);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_CQ);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_QP);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_AH);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MW);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_SRQ);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS);
+
+#define IB_UVERBS_DECLARE_CMD(name) \
+ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ const char __user *buf, int in_len, \
+ int out_len)
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(alloc_mw);
+IB_UVERBS_DECLARE_CMD(dealloc_mw);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(open_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xsrq);
+IB_UVERBS_DECLARE_CMD(open_xrcd);
+IB_UVERBS_DECLARE_CMD(close_xrcd);
+
+#define IB_UVERBS_DECLARE_EX_CMD(name) \
+ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
+ struct ib_udata *ucore, \
+ struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
+IB_UVERBS_DECLARE_EX_CMD(query_device);
+IB_UVERBS_DECLARE_EX_CMD(create_cq);
+IB_UVERBS_DECLARE_EX_CMD(create_qp);
+IB_UVERBS_DECLARE_EX_CMD(create_wq);
+IB_UVERBS_DECLARE_EX_CMD(modify_wq);
+IB_UVERBS_DECLARE_EX_CMD(destroy_wq);
+IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table);
+IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table);
+IB_UVERBS_DECLARE_EX_CMD(modify_qp);
+IB_UVERBS_DECLARE_EX_CMD(modify_cq);
+
+#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
new file mode 100644
index 000000000..5e10a40fd
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -0,0 +1,4146 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <linux/uaccess.h>
+
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+
+#include "uverbs.h"
+#include "core_priv.h"
+
+static struct ib_uverbs_completion_event_file *
+_ib_uverbs_lookup_comp_file(s32 fd, struct ib_uverbs_file *ufile)
+{
+ struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL,
+ fd, ufile);
+
+ if (IS_ERR(uobj))
+ return (void *)uobj;
+
+ uverbs_uobject_get(uobj);
+ uobj_put_read(uobj);
+
+ return container_of(uobj, struct ib_uverbs_completion_event_file,
+ uobj);
+}
+#define ib_uverbs_lookup_comp_file(_fd, _ufile) \
+ _ib_uverbs_lookup_comp_file((_fd)*typecheck(s32, _fd), _ufile)
+
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_get_context cmd;
+ struct ib_uverbs_get_context_resp resp;
+ struct ib_udata udata;
+ struct ib_ucontext *ucontext;
+ struct file *filp;
+ struct ib_rdmacg_object cg_obj;
+ struct ib_device *ib_dev;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->ucontext_lock);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (file->ucontext) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+ if (ret)
+ goto err;
+
+ ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
+ if (IS_ERR(ucontext)) {
+ ret = PTR_ERR(ucontext);
+ goto err_alloc;
+ }
+
+ ucontext->device = ib_dev;
+ ucontext->cg_obj = cg_obj;
+ /* ufile is required when some objects are released */
+ ucontext->ufile = file;
+
+ rcu_read_lock();
+ ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+ ucontext->closing = 0;
+ ucontext->cleanup_retryable = false;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ ucontext->umem_tree = RB_ROOT_CACHED;
+ init_rwsem(&ucontext->umem_rwsem);
+ ucontext->odp_mrs_count = 0;
+ INIT_LIST_HEAD(&ucontext->no_private_counters);
+
+ if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ ucontext->invalidate_range = NULL;
+
+#endif
+
+ resp.num_comp_vectors = file->device->num_comp_vectors;
+
+ ret = get_unused_fd_flags(O_CLOEXEC);
+ if (ret < 0)
+ goto err_free;
+ resp.async_fd = ret;
+
+ filp = ib_uverbs_alloc_async_event_file(file, ib_dev);
+ if (IS_ERR(filp)) {
+ ret = PTR_ERR(filp);
+ goto err_fd;
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_file;
+ }
+
+ fd_install(resp.async_fd, filp);
+
+ /*
+ * Make sure that ib_uverbs_get_ucontext() sees the pointer update
+ * only after all writes to setup the ucontext have completed
+ */
+ smp_store_release(&file->ucontext, ucontext);
+
+ mutex_unlock(&file->ucontext_lock);
+
+ return in_len;
+
+err_file:
+ ib_uverbs_free_async_event_file(file);
+ fput(filp);
+
+err_fd:
+ put_unused_fd(resp.async_fd);
+
+err_free:
+ put_pid(ucontext->tgid);
+ ib_dev->dealloc_ucontext(ucontext);
+
+err_alloc:
+ ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+
+err:
+ mutex_unlock(&file->ucontext_lock);
+ return ret;
+}
+
+static void copy_query_dev_fields(struct ib_ucontext *ucontext,
+ struct ib_uverbs_query_device_resp *resp,
+ struct ib_device_attr *attr)
+{
+ struct ib_device *ib_dev = ucontext->device;
+
+ resp->fw_ver = attr->fw_ver;
+ resp->node_guid = ib_dev->node_guid;
+ resp->sys_image_guid = attr->sys_image_guid;
+ resp->max_mr_size = attr->max_mr_size;
+ resp->page_size_cap = attr->page_size_cap;
+ resp->vendor_id = attr->vendor_id;
+ resp->vendor_part_id = attr->vendor_part_id;
+ resp->hw_ver = attr->hw_ver;
+ resp->max_qp = attr->max_qp;
+ resp->max_qp_wr = attr->max_qp_wr;
+ resp->device_cap_flags = lower_32_bits(attr->device_cap_flags);
+ resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge);
+ resp->max_sge_rd = attr->max_sge_rd;
+ resp->max_cq = attr->max_cq;
+ resp->max_cqe = attr->max_cqe;
+ resp->max_mr = attr->max_mr;
+ resp->max_pd = attr->max_pd;
+ resp->max_qp_rd_atom = attr->max_qp_rd_atom;
+ resp->max_ee_rd_atom = attr->max_ee_rd_atom;
+ resp->max_res_rd_atom = attr->max_res_rd_atom;
+ resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
+ resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
+ resp->atomic_cap = attr->atomic_cap;
+ resp->max_ee = attr->max_ee;
+ resp->max_rdd = attr->max_rdd;
+ resp->max_mw = attr->max_mw;
+ resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
+ resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
+ resp->max_mcast_grp = attr->max_mcast_grp;
+ resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
+ resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+ resp->max_ah = attr->max_ah;
+ resp->max_fmr = attr->max_fmr;
+ resp->max_map_per_fmr = attr->max_map_per_fmr;
+ resp->max_srq = attr->max_srq;
+ resp->max_srq_wr = attr->max_srq_wr;
+ resp->max_srq_sge = attr->max_srq_sge;
+ resp->max_pkeys = attr->max_pkeys;
+ resp->local_ca_ack_delay = attr->local_ca_ack_delay;
+ resp->phys_port_cnt = ib_dev->phys_port_cnt;
+}
+
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_device cmd;
+ struct ib_uverbs_query_device_resp resp;
+ struct ib_ucontext *ucontext;
+
+ ucontext = ib_uverbs_get_ucontext(file);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ memset(&resp, 0, sizeof resp);
+ copy_query_dev_fields(ucontext, &resp, &ucontext->device->attrs);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+/*
+ * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
+ * PortInfo CapabilityMask, but was extended with unique bits.
+ */
+static u32 make_port_cap_flags(const struct ib_port_attr *attr)
+{
+ u32 res;
+
+ /* All IBA CapabilityMask bits are passed through here, except bit 26,
+ * which is overridden with IP_BASED_GIDS. This is due to a historical
+ * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
+ * bits match the IBA definition across all kernel versions.
+ */
+ res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
+
+ if (attr->ip_gids)
+ res |= IB_UVERBS_PCF_IP_BASED_GIDS;
+
+ return res;
+}
+
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_port cmd;
+ struct ib_uverbs_query_port_resp resp;
+ struct ib_port_attr attr;
+ int ret;
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+
+ ucontext = ib_uverbs_get_ucontext(file);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = ib_query_port(ib_dev, cmd.port_num, &attr);
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.state = attr.state;
+ resp.max_mtu = attr.max_mtu;
+ resp.active_mtu = attr.active_mtu;
+ resp.gid_tbl_len = attr.gid_tbl_len;
+ resp.port_cap_flags = make_port_cap_flags(&attr);
+ resp.max_msg_sz = attr.max_msg_sz;
+ resp.bad_pkey_cntr = attr.bad_pkey_cntr;
+ resp.qkey_viol_cntr = attr.qkey_viol_cntr;
+ resp.pkey_tbl_len = attr.pkey_tbl_len;
+
+ if (rdma_is_grh_required(ib_dev, cmd.port_num))
+ resp.flags |= IB_UVERBS_QPF_GRH_REQUIRED;
+
+ if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) {
+ resp.lid = OPA_TO_IB_UCAST_LID(attr.lid);
+ resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid);
+ } else {
+ resp.lid = ib_lid_cpu16(attr.lid);
+ resp.sm_lid = ib_lid_cpu16(attr.sm_lid);
+ }
+ resp.lmc = attr.lmc;
+ resp.max_vl_num = attr.max_vl_num;
+ resp.sm_sl = attr.sm_sl;
+ resp.subnet_timeout = attr.subnet_timeout;
+ resp.init_type_reply = attr.init_type_reply;
+ resp.active_width = attr.active_width;
+ resp.active_speed = attr.active_speed;
+ resp.phys_state = attr.phys_state;
+ resp.link_layer = rdma_port_get_link_layer(ib_dev,
+ cmd.port_num);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_alloc_pd cmd;
+ struct ib_uverbs_alloc_pd_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ int ret;
+ struct ib_device *ib_dev;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ uobj = uobj_alloc(UVERBS_OBJECT_PD, file, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ pd = ib_dev->alloc_pd(ib_dev, uobj->context, &udata);
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
+ goto err;
+ }
+
+ pd->device = ib_dev;
+ pd->uobject = uobj;
+ pd->__internal_mr = NULL;
+ atomic_set(&pd->usecnt, 0);
+
+ uobj->object = pd;
+ memset(&resp, 0, sizeof resp);
+ resp.pd_handle = uobj->id;
+ pd->res.type = RDMA_RESTRACK_PD;
+ rdma_restrack_add(&pd->res);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ return uobj_alloc_commit(uobj, in_len);
+
+err_copy:
+ ib_dealloc_pd(pd);
+
+err:
+ uobj_alloc_abort(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_dealloc_pd cmd;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, file,
+ in_len);
+}
+
+struct xrcd_table_entry {
+ struct rb_node node;
+ struct ib_xrcd *xrcd;
+ struct inode *inode;
+};
+
+static int xrcd_table_insert(struct ib_uverbs_device *dev,
+ struct inode *inode,
+ struct ib_xrcd *xrcd)
+{
+ struct xrcd_table_entry *entry, *scan;
+ struct rb_node **p = &dev->xrcd_tree.rb_node;
+ struct rb_node *parent = NULL;
+
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->xrcd = xrcd;
+ entry->inode = inode;
+
+ while (*p) {
+ parent = *p;
+ scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+ if (inode < scan->inode) {
+ p = &(*p)->rb_left;
+ } else if (inode > scan->inode) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(entry);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&entry->node, parent, p);
+ rb_insert_color(&entry->node, &dev->xrcd_tree);
+ igrab(inode);
+ return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+ struct rb_node *p = dev->xrcd_tree.rb_node;
+
+ while (p) {
+ entry = rb_entry(p, struct xrcd_table_entry, node);
+
+ if (inode < entry->inode)
+ p = p->rb_left;
+ else if (inode > entry->inode)
+ p = p->rb_right;
+ else
+ return entry;
+ }
+
+ return NULL;
+}
+
+static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (!entry)
+ return NULL;
+
+ return entry->xrcd;
+}
+
+static void xrcd_table_delete(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (entry) {
+ iput(inode);
+ rb_erase(&entry->node, &dev->xrcd_tree);
+ kfree(entry);
+ }
+}
+
+ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_open_xrcd cmd;
+ struct ib_uverbs_open_xrcd_resp resp;
+ struct ib_udata udata;
+ struct ib_uxrcd_object *obj;
+ struct ib_xrcd *xrcd = NULL;
+ struct fd f = {NULL, 0};
+ struct inode *inode = NULL;
+ int ret = 0;
+ int new_xrcd = 0;
+ struct ib_device *ib_dev;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ mutex_lock(&file->device->xrcd_tree_mutex);
+
+ if (cmd.fd != -1) {
+ /* search for file descriptor */
+ f = fdget(cmd.fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto err_tree_mutex_unlock;
+ }
+
+ inode = file_inode(f.file);
+ xrcd = find_xrcd(file->device, inode);
+ if (!xrcd && !(cmd.oflags & O_CREAT)) {
+ /* no file descriptor. Need CREATE flag */
+ ret = -EAGAIN;
+ goto err_tree_mutex_unlock;
+ }
+
+ if (xrcd && cmd.oflags & O_EXCL) {
+ ret = -EINVAL;
+ goto err_tree_mutex_unlock;
+ }
+ }
+
+ obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, file,
+ &ib_dev);
+ if (IS_ERR(obj)) {
+ ret = PTR_ERR(obj);
+ goto err_tree_mutex_unlock;
+ }
+
+ if (!xrcd) {
+ xrcd = ib_dev->alloc_xrcd(ib_dev, obj->uobject.context, &udata);
+ if (IS_ERR(xrcd)) {
+ ret = PTR_ERR(xrcd);
+ goto err;
+ }
+
+ xrcd->inode = inode;
+ xrcd->device = ib_dev;
+ atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+ new_xrcd = 1;
+ }
+
+ atomic_set(&obj->refcnt, 0);
+ obj->uobject.object = xrcd;
+ memset(&resp, 0, sizeof resp);
+ resp.xrcd_handle = obj->uobject.id;
+
+ if (inode) {
+ if (new_xrcd) {
+ /* create new inode/xrcd table entry */
+ ret = xrcd_table_insert(file->device, inode, xrcd);
+ if (ret)
+ goto err_dealloc_xrcd;
+ }
+ atomic_inc(&xrcd->usecnt);
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (f.file)
+ fdput(f);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+
+ return uobj_alloc_commit(&obj->uobject, in_len);
+
+err_copy:
+ if (inode) {
+ if (new_xrcd)
+ xrcd_table_delete(file->device, inode);
+ atomic_dec(&xrcd->usecnt);
+ }
+
+err_dealloc_xrcd:
+ ib_dealloc_xrcd(xrcd);
+
+err:
+ uobj_alloc_abort(&obj->uobject);
+
+err_tree_mutex_unlock:
+ if (f.file)
+ fdput(f);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_close_xrcd cmd;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, file,
+ in_len);
+}
+
+int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject,
+ struct ib_xrcd *xrcd,
+ enum rdma_remove_reason why)
+{
+ struct inode *inode;
+ int ret;
+ struct ib_uverbs_device *dev = uobject->context->ufile->device;
+
+ inode = xrcd->inode;
+ if (inode && !atomic_dec_and_test(&xrcd->usecnt))
+ return 0;
+
+ ret = ib_dealloc_xrcd(xrcd);
+
+ if (ib_is_destroy_retryable(ret, why, uobject)) {
+ atomic_inc(&xrcd->usecnt);
+ return ret;
+ }
+
+ if (inode)
+ xrcd_table_delete(dev, inode);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_reg_mr cmd;
+ struct ib_uverbs_reg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ int ret;
+ struct ib_device *ib_dev;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+ return -EINVAL;
+
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ return ret;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_MR, file, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+ if (!(pd->device->attrs.device_cap_flags &
+ IB_DEVICE_ON_DEMAND_PAGING)) {
+ pr_debug("ODP support not available\n");
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+
+ mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags, &udata);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto err_put;
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->dm = NULL;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ mr->res.type = RDMA_RESTRACK_MR;
+ rdma_restrack_add(&mr->res);
+
+ uobj->object = mr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+ resp.mr_handle = uobj->id;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ uobj_put_obj_read(pd);
+
+ return uobj_alloc_commit(uobj, in_len);
+
+err_copy:
+ ib_dereg_mr(mr);
+
+err_put:
+ uobj_put_obj_read(pd);
+
+err_free:
+ uobj_alloc_abort(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_rereg_mr cmd;
+ struct ib_uverbs_rereg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_pd *pd = NULL;
+ struct ib_mr *mr;
+ struct ib_pd *old_pd;
+ int ret;
+ struct ib_uobject *uobj;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
+ return -EINVAL;
+
+ if ((cmd.flags & IB_MR_REREG_TRANS) &&
+ (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
+ (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
+ return -EINVAL;
+
+ uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, file);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ mr = uobj->object;
+
+ if (mr->dm) {
+ ret = -EINVAL;
+ goto put_uobjs;
+ }
+
+ if (cmd.flags & IB_MR_REREG_ACCESS) {
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ goto put_uobjs;
+ }
+
+ if (cmd.flags & IB_MR_REREG_PD) {
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
+ file);
+ if (!pd) {
+ ret = -EINVAL;
+ goto put_uobjs;
+ }
+ }
+
+ old_pd = mr->pd;
+ ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+ cmd.length, cmd.hca_va,
+ cmd.access_flags, pd, &udata);
+ if (!ret) {
+ if (cmd.flags & IB_MR_REREG_PD) {
+ atomic_inc(&pd->usecnt);
+ mr->pd = pd;
+ atomic_dec(&old_pd->usecnt);
+ }
+ } else {
+ goto put_uobj_pd;
+ }
+
+ memset(&resp, 0, sizeof(resp));
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp)))
+ ret = -EFAULT;
+ else
+ ret = in_len;
+
+put_uobj_pd:
+ if (cmd.flags & IB_MR_REREG_PD)
+ uobj_put_obj_read(pd);
+
+put_uobjs:
+ uobj_put_write(uobj);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dereg_mr cmd;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, file,
+ in_len);
+}
+
+ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_alloc_mw cmd;
+ struct ib_uverbs_alloc_mw_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mw *mw;
+ struct ib_udata udata;
+ int ret;
+ struct ib_device *ib_dev;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_MW, file, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
+ if (IS_ERR(mw)) {
+ ret = PTR_ERR(mw);
+ goto err_put;
+ }
+
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+
+ uobj->object = mw;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.rkey = mw->rkey;
+ resp.mw_handle = uobj->id;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ uobj_put_obj_read(pd);
+ return uobj_alloc_commit(uobj, in_len);
+
+err_copy:
+ uverbs_dealloc_mw(mw);
+err_put:
+ uobj_put_obj_read(pd);
+err_free:
+ uobj_alloc_abort(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dealloc_mw cmd;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, file,
+ in_len);
+}
+
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_comp_channel cmd;
+ struct ib_uverbs_create_comp_channel_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_uverbs_completion_event_file *ev_file;
+ struct ib_device *ib_dev;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, file, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ resp.fd = uobj->id;
+
+ ev_file = container_of(uobj, struct ib_uverbs_completion_event_file,
+ uobj);
+ ib_uverbs_init_event_queue(&ev_file->ev_queue);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ uobj_alloc_abort(uobj);
+ return -EFAULT;
+ }
+
+ return uobj_alloc_commit(uobj, in_len);
+}
+
+static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw,
+ struct ib_uverbs_ex_create_cq *cmd,
+ size_t cmd_sz,
+ int (*cb)(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *udata,
+ void *context),
+ void *context)
+{
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_completion_event_file *ev_file = NULL;
+ struct ib_cq *cq;
+ int ret;
+ struct ib_uverbs_ex_create_cq_resp resp;
+ struct ib_cq_init_attr attr = {};
+ struct ib_device *ib_dev;
+
+ if (cmd->comp_vector >= file->device->num_comp_vectors)
+ return ERR_PTR(-EINVAL);
+
+ obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, file,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return obj;
+
+ if (!ib_dev->create_cq) {
+ ret = -EOPNOTSUPP;
+ goto err;
+ }
+
+ if (cmd->comp_channel >= 0) {
+ ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, file);
+ if (IS_ERR(ev_file)) {
+ ret = PTR_ERR(ev_file);
+ goto err;
+ }
+ }
+
+ obj->uobject.user_handle = cmd->user_handle;
+ obj->comp_events_reported = 0;
+ obj->async_events_reported = 0;
+ INIT_LIST_HEAD(&obj->comp_list);
+ INIT_LIST_HEAD(&obj->async_list);
+
+ attr.cqe = cmd->cqe;
+ attr.comp_vector = cmd->comp_vector;
+
+ if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
+ attr.flags = cmd->flags;
+
+ cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, uhw);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto err_file;
+ }
+
+ cq->device = ib_dev;
+ cq->uobject = &obj->uobject;
+ cq->comp_handler = ib_uverbs_comp_handler;
+ cq->event_handler = ib_uverbs_cq_event_handler;
+ cq->cq_context = ev_file ? &ev_file->ev_queue : NULL;
+ atomic_set(&cq->usecnt, 0);
+
+ obj->uobject.object = cq;
+ memset(&resp, 0, sizeof resp);
+ resp.base.cq_handle = obj->uobject.id;
+ resp.base.cqe = cq->cqe;
+
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+
+ cq->res.type = RDMA_RESTRACK_CQ;
+ rdma_restrack_add(&cq->res);
+
+ ret = cb(file, obj, &resp, ucore, context);
+ if (ret)
+ goto err_cb;
+
+ ret = uobj_alloc_commit(&obj->uobject, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ return obj;
+
+err_cb:
+ ib_destroy_cq(cq);
+
+err_file:
+ if (ev_file)
+ ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+ uobj_alloc_abort(&obj->uobject);
+
+ return ERR_PTR(ret);
+}
+
+static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *ucore, void *context)
+{
+ if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
+ return -EFAULT;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_ex_create_cq cmd_ex;
+ struct ib_uverbs_create_cq_resp resp;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ struct ib_ucq_object *obj;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response),
+ sizeof(cmd), sizeof(resp));
+
+ ib_uverbs_init_udata(&uhw, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.cqe = cmd.cqe;
+ cmd_ex.comp_vector = cmd.comp_vector;
+ cmd_ex.comp_channel = cmd.comp_channel;
+
+ obj = create_cq(file, &ucore, &uhw, &cmd_ex,
+ offsetof(typeof(cmd_ex), comp_channel) +
+ sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
+ NULL);
+
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ return in_len;
+}
+
+static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *ucore, void *context)
+{
+ if (ib_copy_to_udata(ucore, resp, resp->response_length))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_cq_resp resp;
+ struct ib_uverbs_ex_create_cq cmd;
+ struct ib_ucq_object *obj;
+ int err;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ if (ucore->outlen < (offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length)))
+ return -ENOSPC;
+
+ obj = create_cq(file, ucore, uhw, &cmd,
+ min(ucore->inlen, sizeof(cmd)),
+ ib_uverbs_ex_create_cq_cb, NULL);
+
+ return PTR_ERR_OR_ZERO(obj);
+}
+
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_resize_cq cmd;
+ struct ib_uverbs_resize_cq_resp resp = {};
+ struct ib_udata udata;
+ struct ib_cq *cq;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ if (!cq)
+ return -EINVAL;
+
+ ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+ if (ret)
+ goto out;
+
+ resp.cqe = cq->cqe;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp.cqe))
+ ret = -EFAULT;
+
+out:
+ uobj_put_obj_read(cq);
+
+ return ret ? ret : in_len;
+}
+
+static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest,
+ struct ib_wc *wc)
+{
+ struct ib_uverbs_wc tmp;
+
+ tmp.wr_id = wc->wr_id;
+ tmp.status = wc->status;
+ tmp.opcode = wc->opcode;
+ tmp.vendor_err = wc->vendor_err;
+ tmp.byte_len = wc->byte_len;
+ tmp.ex.imm_data = wc->ex.imm_data;
+ tmp.qp_num = wc->qp->qp_num;
+ tmp.src_qp = wc->src_qp;
+ tmp.wc_flags = wc->wc_flags;
+ tmp.pkey_index = wc->pkey_index;
+ if (rdma_cap_opa_ah(ib_dev, wc->port_num))
+ tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid);
+ else
+ tmp.slid = ib_lid_cpu16(wc->slid);
+ tmp.sl = wc->sl;
+ tmp.dlid_path_bits = wc->dlid_path_bits;
+ tmp.port_num = wc->port_num;
+ tmp.reserved = 0;
+
+ if (copy_to_user(dest, &tmp, sizeof tmp))
+ return -EFAULT;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_poll_cq cmd;
+ struct ib_uverbs_poll_cq_resp resp;
+ u8 __user *header_ptr;
+ u8 __user *data_ptr;
+ struct ib_cq *cq;
+ struct ib_wc wc;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ if (!cq)
+ return -EINVAL;
+
+ /* we copy a struct ib_uverbs_poll_cq_resp to user space */
+ header_ptr = u64_to_user_ptr(cmd.response);
+ data_ptr = header_ptr + sizeof resp;
+
+ memset(&resp, 0, sizeof resp);
+ while (resp.count < cmd.ne) {
+ ret = ib_poll_cq(cq, 1, &wc);
+ if (ret < 0)
+ goto out_put;
+ if (!ret)
+ break;
+
+ ret = copy_wc_to_user(cq->device, data_ptr, &wc);
+ if (ret)
+ goto out_put;
+
+ data_ptr += sizeof(struct ib_uverbs_wc);
+ ++resp.count;
+ }
+
+ if (copy_to_user(header_ptr, &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+
+ ret = in_len;
+
+out_put:
+ uobj_put_obj_read(cq);
+ return ret;
+}
+
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_req_notify_cq cmd;
+ struct ib_cq *cq;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ if (!cq)
+ return -EINVAL;
+
+ ib_req_notify_cq(cq, cmd.solicited_only ?
+ IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+
+ uobj_put_obj_read(cq);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_cq cmd;
+ struct ib_uverbs_destroy_cq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_ucq_object *obj;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_ucq_object, uobject);
+ memset(&resp, 0, sizeof(resp));
+ resp.comp_events_reported = obj->comp_events_reported;
+ resp.async_events_reported = obj->async_events_reported;
+
+ uobj_put_destroy(uobj);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+static int create_qp(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw,
+ struct ib_uverbs_ex_create_qp *cmd,
+ size_t cmd_sz,
+ int (*cb)(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *udata),
+ void *context)
+{
+ struct ib_uqp_object *obj;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *xrcd_uobj = ERR_PTR(-ENOENT);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_qp *qp;
+ char *buf;
+ struct ib_qp_init_attr attr = {};
+ struct ib_uverbs_ex_create_qp_resp resp;
+ int ret;
+ struct ib_rwq_ind_table *ind_tbl = NULL;
+ bool has_sq = true;
+ struct ib_device *ib_dev;
+
+ if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ return -EPERM;
+
+ obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ obj->uxrcd = NULL;
+ obj->uevent.uobject.user_handle = cmd->user_handle;
+ mutex_init(&obj->mcast_lock);
+
+ if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) +
+ sizeof(cmd->rwq_ind_tbl_handle) &&
+ (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) {
+ ind_tbl = uobj_get_obj_read(rwq_ind_table,
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ cmd->rwq_ind_tbl_handle, file);
+ if (!ind_tbl) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ attr.rwq_ind_tbl = ind_tbl;
+ }
+
+ if (cmd_sz > sizeof(*cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(*cmd),
+ cmd_sz - sizeof(*cmd))) {
+ ret = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (ind_tbl && !cmd->max_send_wr)
+ has_sq = false;
+
+ if (cmd->qp_type == IB_QPT_XRC_TGT) {
+ xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle,
+ file);
+
+ if (IS_ERR(xrcd_uobj)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ device = xrcd->device;
+ } else {
+ if (cmd->qp_type == IB_QPT_XRC_INI) {
+ cmd->max_recv_wr = 0;
+ cmd->max_recv_sge = 0;
+ } else {
+ if (cmd->is_srq) {
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ,
+ cmd->srq_handle, file);
+ if (!srq || srq->srq_type == IB_SRQT_XRC) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+
+ if (!ind_tbl) {
+ if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+ rcq = uobj_get_obj_read(
+ cq, UVERBS_OBJECT_CQ,
+ cmd->recv_cq_handle, file);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+ }
+ }
+
+ if (has_sq)
+ scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
+ cmd->send_cq_handle, file);
+ if (!ind_tbl)
+ rcq = rcq ?: scq;
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle,
+ file);
+ if (!pd || (!scq && has_sq)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ device = pd->device;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.send_cq = scq;
+ attr.recv_cq = rcq;
+ attr.srq = srq;
+ attr.xrcd = xrcd;
+ attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR :
+ IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd->qp_type;
+ attr.create_flags = 0;
+
+ attr.cap.max_send_wr = cmd->max_send_wr;
+ attr.cap.max_recv_wr = cmd->max_recv_wr;
+ attr.cap.max_send_sge = cmd->max_send_sge;
+ attr.cap.max_recv_sge = cmd->max_recv_sge;
+ attr.cap.max_inline_data = cmd->max_inline_data;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ if (cmd_sz >= offsetof(typeof(*cmd), create_flags) +
+ sizeof(cmd->create_flags))
+ attr.create_flags = cmd->create_flags;
+
+ if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
+ IB_QP_CREATE_CROSS_CHANNEL |
+ IB_QP_CREATE_MANAGED_SEND |
+ IB_QP_CREATE_MANAGED_RECV |
+ IB_QP_CREATE_SCATTER_FCS |
+ IB_QP_CREATE_CVLAN_STRIPPING |
+ IB_QP_CREATE_SOURCE_QPN |
+ IB_QP_CREATE_PCI_WRITE_END_PADDING)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) {
+ if (!capable(CAP_NET_RAW)) {
+ ret = -EPERM;
+ goto err_put;
+ }
+
+ attr.source_qpn = cmd->source_qpn;
+ }
+
+ buf = (void *)cmd + sizeof(*cmd);
+ if (cmd_sz > sizeof(*cmd))
+ if (!(buf[0] == 0 && !memcmp(buf, buf + 1,
+ cmd_sz - sizeof(*cmd) - 1))) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (cmd->qp_type == IB_QPT_XRC_TGT)
+ qp = ib_create_qp(pd, &attr);
+ else
+ qp = _ib_create_qp(device, pd, &attr, uhw,
+ &obj->uevent.uobject);
+
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+
+ if (cmd->qp_type != IB_QPT_XRC_TGT) {
+ ret = ib_create_qp_security(qp, device);
+ if (ret)
+ goto err_cb;
+
+ qp->real_qp = qp;
+ qp->pd = pd;
+ qp->send_cq = attr.send_cq;
+ qp->recv_cq = attr.recv_cq;
+ qp->srq = attr.srq;
+ qp->rwq_ind_tbl = ind_tbl;
+ qp->event_handler = attr.event_handler;
+ qp->qp_context = attr.qp_context;
+ qp->qp_type = attr.qp_type;
+ atomic_set(&qp->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ qp->port = 0;
+ if (attr.send_cq)
+ atomic_inc(&attr.send_cq->usecnt);
+ if (attr.recv_cq)
+ atomic_inc(&attr.recv_cq->usecnt);
+ if (attr.srq)
+ atomic_inc(&attr.srq->usecnt);
+ if (ind_tbl)
+ atomic_inc(&ind_tbl->usecnt);
+ } else {
+ /* It is done in _ib_create_qp for other QP types */
+ qp->uobject = &obj->uevent.uobject;
+ }
+
+ obj->uevent.uobject.object = qp;
+
+ memset(&resp, 0, sizeof resp);
+ resp.base.qpn = qp->qp_num;
+ resp.base.qp_handle = obj->uevent.uobject.id;
+ resp.base.max_recv_sge = attr.cap.max_recv_sge;
+ resp.base.max_send_sge = attr.cap.max_send_sge;
+ resp.base.max_recv_wr = attr.cap.max_recv_wr;
+ resp.base.max_send_wr = attr.cap.max_send_wr;
+ resp.base.max_inline_data = attr.cap.max_inline_data;
+
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+
+ ret = cb(file, &resp, ucore);
+ if (ret)
+ goto err_cb;
+
+ if (xrcd) {
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+ uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ uobj_put_read(xrcd_uobj);
+ }
+
+ if (pd)
+ uobj_put_obj_read(pd);
+ if (scq)
+ uobj_put_obj_read(scq);
+ if (rcq && rcq != scq)
+ uobj_put_obj_read(rcq);
+ if (srq)
+ uobj_put_obj_read(srq);
+ if (ind_tbl)
+ uobj_put_obj_read(ind_tbl);
+
+ return uobj_alloc_commit(&obj->uevent.uobject, 0);
+err_cb:
+ ib_destroy_qp(qp);
+
+err_put:
+ if (!IS_ERR(xrcd_uobj))
+ uobj_put_read(xrcd_uobj);
+ if (pd)
+ uobj_put_obj_read(pd);
+ if (scq)
+ uobj_put_obj_read(scq);
+ if (rcq && rcq != scq)
+ uobj_put_obj_read(rcq);
+ if (srq)
+ uobj_put_obj_read(srq);
+ if (ind_tbl)
+ uobj_put_obj_read(ind_tbl);
+
+ uobj_alloc_abort(&obj->uevent.uobject);
+ return ret;
+}
+
+static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *ucore)
+{
+ if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
+ return -EFAULT;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_qp cmd;
+ struct ib_uverbs_ex_create_qp cmd_ex;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp);
+ int err;
+
+ if (out_len < resp_size)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response),
+ sizeof(cmd), resp_size);
+ ib_uverbs_init_udata(&uhw, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + resp_size,
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - resp_size);
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.pd_handle = cmd.pd_handle;
+ cmd_ex.send_cq_handle = cmd.send_cq_handle;
+ cmd_ex.recv_cq_handle = cmd.recv_cq_handle;
+ cmd_ex.srq_handle = cmd.srq_handle;
+ cmd_ex.max_send_wr = cmd.max_send_wr;
+ cmd_ex.max_recv_wr = cmd.max_recv_wr;
+ cmd_ex.max_send_sge = cmd.max_send_sge;
+ cmd_ex.max_recv_sge = cmd.max_recv_sge;
+ cmd_ex.max_inline_data = cmd.max_inline_data;
+ cmd_ex.sq_sig_all = cmd.sq_sig_all;
+ cmd_ex.qp_type = cmd.qp_type;
+ cmd_ex.is_srq = cmd.is_srq;
+
+ err = create_qp(file, &ucore, &uhw, &cmd_ex,
+ offsetof(typeof(cmd_ex), is_srq) +
+ sizeof(cmd.is_srq), ib_uverbs_create_qp_cb,
+ NULL);
+
+ if (err)
+ return err;
+
+ return in_len;
+}
+
+static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *ucore)
+{
+ if (ib_copy_to_udata(ucore, resp, resp->response_length))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_qp_resp resp;
+ struct ib_uverbs_ex_create_qp cmd = {0};
+ int err;
+
+ if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) +
+ sizeof(cmd.comp_mask)))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ if (ucore->outlen < (offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length)))
+ return -ENOSPC;
+
+ err = create_qp(file, ucore, uhw, &cmd,
+ min(ucore->inlen, sizeof(cmd)),
+ ib_uverbs_ex_create_qp_cb, NULL);
+
+ if (err)
+ return err;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_open_qp cmd;
+ struct ib_uverbs_create_qp_resp resp;
+ struct ib_udata udata;
+ struct ib_uqp_object *obj;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_qp *qp;
+ struct ib_qp_open_attr attr;
+ int ret;
+ struct ib_device *ib_dev;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, file);
+ if (IS_ERR(xrcd_uobj)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_xrcd;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.qp_num = cmd.qpn;
+ attr.qp_type = cmd.qp_type;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ qp = ib_open_qp(xrcd, &attr);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_xrcd;
+ }
+
+ obj->uevent.uobject.object = qp;
+ obj->uevent.uobject.user_handle = cmd.user_handle;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qpn = qp->qp_num;
+ resp.qp_handle = obj->uevent.uobject.id;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_destroy;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ qp->uobject = &obj->uevent.uobject;
+ uobj_put_read(xrcd_uobj);
+
+ return uobj_alloc_commit(&obj->uevent.uobject, in_len);
+
+err_destroy:
+ ib_destroy_qp(qp);
+err_xrcd:
+ uobj_put_read(xrcd_uobj);
+err_put:
+ uobj_alloc_abort(&obj->uevent.uobject);
+ return ret;
+}
+
+static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr,
+ struct rdma_ah_attr *rdma_attr)
+{
+ const struct ib_global_route *grh;
+
+ uverb_attr->dlid = rdma_ah_get_dlid(rdma_attr);
+ uverb_attr->sl = rdma_ah_get_sl(rdma_attr);
+ uverb_attr->src_path_bits = rdma_ah_get_path_bits(rdma_attr);
+ uverb_attr->static_rate = rdma_ah_get_static_rate(rdma_attr);
+ uverb_attr->is_global = !!(rdma_ah_get_ah_flags(rdma_attr) &
+ IB_AH_GRH);
+ if (uverb_attr->is_global) {
+ grh = rdma_ah_read_grh(rdma_attr);
+ memcpy(uverb_attr->dgid, grh->dgid.raw, 16);
+ uverb_attr->flow_label = grh->flow_label;
+ uverb_attr->sgid_index = grh->sgid_index;
+ uverb_attr->hop_limit = grh->hop_limit;
+ uverb_attr->traffic_class = grh->traffic_class;
+ }
+ uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr);
+}
+
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_query_qp cmd;
+ struct ib_uverbs_query_qp_resp resp;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ struct ib_qp_init_attr *init_attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+ if (!attr || !init_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+ uobj_put_obj_read(qp);
+
+ if (ret)
+ goto out;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.qp_state = attr->qp_state;
+ resp.cur_qp_state = attr->cur_qp_state;
+ resp.path_mtu = attr->path_mtu;
+ resp.path_mig_state = attr->path_mig_state;
+ resp.qkey = attr->qkey;
+ resp.rq_psn = attr->rq_psn;
+ resp.sq_psn = attr->sq_psn;
+ resp.dest_qp_num = attr->dest_qp_num;
+ resp.qp_access_flags = attr->qp_access_flags;
+ resp.pkey_index = attr->pkey_index;
+ resp.alt_pkey_index = attr->alt_pkey_index;
+ resp.sq_draining = attr->sq_draining;
+ resp.max_rd_atomic = attr->max_rd_atomic;
+ resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
+ resp.min_rnr_timer = attr->min_rnr_timer;
+ resp.port_num = attr->port_num;
+ resp.timeout = attr->timeout;
+ resp.retry_cnt = attr->retry_cnt;
+ resp.rnr_retry = attr->rnr_retry;
+ resp.alt_port_num = attr->alt_port_num;
+ resp.alt_timeout = attr->alt_timeout;
+
+ copy_ah_attr_to_uverbs(&resp.dest, &attr->ah_attr);
+ copy_ah_attr_to_uverbs(&resp.alt_dest, &attr->alt_ah_attr);
+
+ resp.max_send_wr = init_attr->cap.max_send_wr;
+ resp.max_recv_wr = init_attr->cap.max_recv_wr;
+ resp.max_send_sge = init_attr->cap.max_send_sge;
+ resp.max_recv_sge = init_attr->cap.max_recv_sge;
+ resp.max_inline_data = init_attr->cap.max_inline_data;
+ resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ kfree(attr);
+ kfree(init_attr);
+
+ return ret ? ret : in_len;
+}
+
+/* Remove ignored fields set in the attribute mask */
+static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
+{
+ switch (qp_type) {
+ case IB_QPT_XRC_INI:
+ return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);
+ case IB_QPT_XRC_TGT:
+ return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY);
+ default:
+ return mask;
+ }
+}
+
+static void copy_ah_attr_from_uverbs(struct ib_device *dev,
+ struct rdma_ah_attr *rdma_attr,
+ struct ib_uverbs_qp_dest *uverb_attr)
+{
+ rdma_attr->type = rdma_ah_find_type(dev, uverb_attr->port_num);
+ if (uverb_attr->is_global) {
+ rdma_ah_set_grh(rdma_attr, NULL,
+ uverb_attr->flow_label,
+ uverb_attr->sgid_index,
+ uverb_attr->hop_limit,
+ uverb_attr->traffic_class);
+ rdma_ah_set_dgid_raw(rdma_attr, uverb_attr->dgid);
+ } else {
+ rdma_ah_set_ah_flags(rdma_attr, 0);
+ }
+ rdma_ah_set_dlid(rdma_attr, uverb_attr->dlid);
+ rdma_ah_set_sl(rdma_attr, uverb_attr->sl);
+ rdma_ah_set_path_bits(rdma_attr, uverb_attr->src_path_bits);
+ rdma_ah_set_static_rate(rdma_attr, uverb_attr->static_rate);
+ rdma_ah_set_port_num(rdma_attr, uverb_attr->port_num);
+ rdma_ah_set_make_grd(rdma_attr, false);
+}
+
+static int modify_qp(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_modify_qp *cmd, struct ib_udata *udata)
+{
+ struct ib_qp_attr *attr;
+ struct ib_qp *qp;
+ int ret;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, file);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_PORT) &&
+ !rdma_is_port_valid(qp->device, cmd->base.port_num)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_AV)) {
+ if (!rdma_is_port_valid(qp->device, cmd->base.dest.port_num)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if (cmd->base.attr_mask & IB_QP_STATE &&
+ cmd->base.qp_state == IB_QPS_RTR) {
+ /* We are in INIT->RTR TRANSITION (if we are not,
+ * this transition will be rejected in subsequent checks).
+ * In the INIT->RTR transition, we cannot have IB_QP_PORT set,
+ * but the IB_QP_STATE flag is required.
+ *
+ * Since kernel 3.14 (commit dbf727de7440), the uverbs driver,
+ * when IB_QP_AV is set, has required inclusion of a valid
+ * port number in the primary AV. (AVs are created and handled
+ * differently for infiniband and ethernet (RoCE) ports).
+ *
+ * Check the port number included in the primary AV against
+ * the port number in the qp struct, which was set (and saved)
+ * in the RST->INIT transition.
+ */
+ if (cmd->base.dest.port_num != qp->real_qp->port) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+ } else {
+ /* We are in SQD->SQD. (If we are not, this transition will
+ * be rejected later in the verbs layer checks).
+ * Check for both IB_QP_PORT and IB_QP_AV, these can be set
+ * together in the SQD->SQD transition.
+ *
+ * If only IP_QP_AV was set, add in IB_QP_PORT as well (the
+ * verbs layer driver does not track primary port changes
+ * resulting from path migration. Thus, in SQD, if the primary
+ * AV is modified, the primary port should also be modified).
+ *
+ * Note that in this transition, the IB_QP_STATE flag
+ * is not allowed.
+ */
+ if (((cmd->base.attr_mask & (IB_QP_AV | IB_QP_PORT))
+ == (IB_QP_AV | IB_QP_PORT)) &&
+ cmd->base.port_num != cmd->base.dest.port_num) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+ if ((cmd->base.attr_mask & (IB_QP_AV | IB_QP_PORT))
+ == IB_QP_AV) {
+ cmd->base.attr_mask |= IB_QP_PORT;
+ cmd->base.port_num = cmd->base.dest.port_num;
+ }
+ }
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_ALT_PATH) &&
+ (!rdma_is_port_valid(qp->device, cmd->base.alt_port_num) ||
+ !rdma_is_port_valid(qp->device, cmd->base.alt_dest.port_num) ||
+ cmd->base.alt_port_num != cmd->base.alt_dest.port_num)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_CUR_STATE &&
+ cmd->base.cur_qp_state > IB_QPS_ERR) ||
+ (cmd->base.attr_mask & IB_QP_STATE &&
+ cmd->base.qp_state > IB_QPS_ERR)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if (cmd->base.attr_mask & IB_QP_STATE)
+ attr->qp_state = cmd->base.qp_state;
+ if (cmd->base.attr_mask & IB_QP_CUR_STATE)
+ attr->cur_qp_state = cmd->base.cur_qp_state;
+ if (cmd->base.attr_mask & IB_QP_PATH_MTU)
+ attr->path_mtu = cmd->base.path_mtu;
+ if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE)
+ attr->path_mig_state = cmd->base.path_mig_state;
+ if (cmd->base.attr_mask & IB_QP_QKEY)
+ attr->qkey = cmd->base.qkey;
+ if (cmd->base.attr_mask & IB_QP_RQ_PSN)
+ attr->rq_psn = cmd->base.rq_psn;
+ if (cmd->base.attr_mask & IB_QP_SQ_PSN)
+ attr->sq_psn = cmd->base.sq_psn;
+ if (cmd->base.attr_mask & IB_QP_DEST_QPN)
+ attr->dest_qp_num = cmd->base.dest_qp_num;
+ if (cmd->base.attr_mask & IB_QP_ACCESS_FLAGS)
+ attr->qp_access_flags = cmd->base.qp_access_flags;
+ if (cmd->base.attr_mask & IB_QP_PKEY_INDEX)
+ attr->pkey_index = cmd->base.pkey_index;
+ if (cmd->base.attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
+ attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify;
+ if (cmd->base.attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+ attr->max_rd_atomic = cmd->base.max_rd_atomic;
+ if (cmd->base.attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ attr->max_dest_rd_atomic = cmd->base.max_dest_rd_atomic;
+ if (cmd->base.attr_mask & IB_QP_MIN_RNR_TIMER)
+ attr->min_rnr_timer = cmd->base.min_rnr_timer;
+ if (cmd->base.attr_mask & IB_QP_PORT)
+ attr->port_num = cmd->base.port_num;
+ if (cmd->base.attr_mask & IB_QP_TIMEOUT)
+ attr->timeout = cmd->base.timeout;
+ if (cmd->base.attr_mask & IB_QP_RETRY_CNT)
+ attr->retry_cnt = cmd->base.retry_cnt;
+ if (cmd->base.attr_mask & IB_QP_RNR_RETRY)
+ attr->rnr_retry = cmd->base.rnr_retry;
+ if (cmd->base.attr_mask & IB_QP_ALT_PATH) {
+ attr->alt_port_num = cmd->base.alt_port_num;
+ attr->alt_timeout = cmd->base.alt_timeout;
+ attr->alt_pkey_index = cmd->base.alt_pkey_index;
+ }
+ if (cmd->base.attr_mask & IB_QP_RATE_LIMIT)
+ attr->rate_limit = cmd->rate_limit;
+
+ if (cmd->base.attr_mask & IB_QP_AV)
+ copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr,
+ &cmd->base.dest);
+
+ if (cmd->base.attr_mask & IB_QP_ALT_PATH)
+ copy_ah_attr_from_uverbs(qp->device, &attr->alt_ah_attr,
+ &cmd->base.alt_dest);
+
+ ret = ib_modify_qp_with_udata(qp, attr,
+ modify_qp_mask(qp->qp_type,
+ cmd->base.attr_mask),
+ udata);
+
+release_qp:
+ uobj_put_obj_read(qp);
+out:
+ kfree(attr);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_ex_modify_qp cmd = {};
+ struct ib_udata udata;
+ int ret;
+
+ if (copy_from_user(&cmd.base, buf, sizeof(cmd.base)))
+ return -EFAULT;
+
+ if (cmd.base.attr_mask &
+ ~((IB_USER_LEGACY_LAST_QP_ATTR_MASK << 1) - 1))
+ return -EOPNOTSUPP;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd.base), NULL,
+ in_len - sizeof(cmd.base) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len);
+
+ ret = modify_qp(file, &cmd, &udata);
+ if (ret)
+ return ret;
+
+ return in_len;
+}
+
+int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_modify_qp cmd = {};
+ int ret;
+
+ /*
+ * Last bit is reserved for extending the attr_mask by
+ * using another field.
+ */
+ BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1 << 31));
+
+ if (ucore->inlen < sizeof(cmd.base))
+ return -EINVAL;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (cmd.base.attr_mask &
+ ~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1))
+ return -EOPNOTSUPP;
+
+ if (ucore->inlen > sizeof(cmd)) {
+ if (!ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+ }
+
+ ret = modify_qp(file, &cmd, uhw);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_qp cmd;
+ struct ib_uverbs_destroy_qp_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_uqp_object *obj;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+ memset(&resp, 0, sizeof(resp));
+ resp.events_reported = obj->uevent.events_reported;
+
+ uobj_put_destroy(uobj);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+static void *alloc_wr(size_t wr_size, __u32 num_sge)
+{
+ if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof (struct ib_sge))) /
+ sizeof (struct ib_sge))
+ return NULL;
+
+ return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) +
+ num_sge * sizeof (struct ib_sge), GFP_KERNEL);
+}
+
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_send cmd;
+ struct ib_uverbs_post_send_resp resp;
+ struct ib_uverbs_send_wr *user_wr;
+ struct ib_send_wr *wr = NULL, *last, *next;
+ const struct ib_send_wr *bad_wr;
+ struct ib_qp *qp;
+ int i, sg_ind;
+ int is_ud;
+ ssize_t ret = -EINVAL;
+ size_t next_size;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
+ cmd.sge_count * sizeof (struct ib_uverbs_sge))
+ return -EINVAL;
+
+ if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+ return -EINVAL;
+
+ user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return -ENOMEM;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ if (!qp)
+ goto out;
+
+ is_ud = qp->qp_type == IB_QPT_UD;
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < cmd.wr_count; ++i) {
+ if (copy_from_user(user_wr,
+ buf + sizeof cmd + i * cmd.wqe_size,
+ cmd.wqe_size)) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+
+ if (user_wr->num_sge + sg_ind > cmd.sge_count) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ if (is_ud) {
+ struct ib_ud_wr *ud;
+
+ if (user_wr->opcode != IB_WR_SEND &&
+ user_wr->opcode != IB_WR_SEND_WITH_IMM) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ next_size = sizeof(*ud);
+ ud = alloc_wr(next_size, user_wr->num_sge);
+ if (!ud) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH,
+ user_wr->wr.ud.ah, file);
+ if (!ud->ah) {
+ kfree(ud);
+ ret = -EINVAL;
+ goto out_put;
+ }
+ ud->remote_qpn = user_wr->wr.ud.remote_qpn;
+ ud->remote_qkey = user_wr->wr.ud.remote_qkey;
+
+ next = &ud->wr;
+ } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE ||
+ user_wr->opcode == IB_WR_RDMA_READ) {
+ struct ib_rdma_wr *rdma;
+
+ next_size = sizeof(*rdma);
+ rdma = alloc_wr(next_size, user_wr->num_sge);
+ if (!rdma) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ rdma->remote_addr = user_wr->wr.rdma.remote_addr;
+ rdma->rkey = user_wr->wr.rdma.rkey;
+
+ next = &rdma->wr;
+ } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ struct ib_atomic_wr *atomic;
+
+ next_size = sizeof(*atomic);
+ atomic = alloc_wr(next_size, user_wr->num_sge);
+ if (!atomic) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ atomic->remote_addr = user_wr->wr.atomic.remote_addr;
+ atomic->compare_add = user_wr->wr.atomic.compare_add;
+ atomic->swap = user_wr->wr.atomic.swap;
+ atomic->rkey = user_wr->wr.atomic.rkey;
+
+ next = &atomic->wr;
+ } else if (user_wr->opcode == IB_WR_SEND ||
+ user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next_size = sizeof(*next);
+ next = alloc_wr(next_size, user_wr->num_sge);
+ if (!next) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ } else {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ if (user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+ next->opcode = user_wr->opcode;
+ next->send_flags = user_wr->send_flags;
+
+ if (next->num_sge) {
+ next->sg_list = (void *) next +
+ ALIGN(next_size, sizeof(struct ib_sge));
+ if (copy_from_user(next->sg_list,
+ buf + sizeof cmd +
+ cmd.wr_count * cmd.wqe_size +
+ sg_ind * sizeof (struct ib_sge),
+ next->num_sge * sizeof (struct ib_sge))) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ resp.bad_wr = 0;
+ ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ ret = -EFAULT;
+
+out_put:
+ uobj_put_obj_read(qp);
+
+ while (wr) {
+ if (is_ud && ud_wr(wr)->ah)
+ uobj_put_obj_read(ud_wr(wr)->ah);
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+out:
+ kfree(user_wr);
+
+ return ret ? ret : in_len;
+}
+
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+ int in_len,
+ u32 wr_count,
+ u32 sge_count,
+ u32 wqe_size)
+{
+ struct ib_uverbs_recv_wr *user_wr;
+ struct ib_recv_wr *wr = NULL, *last, *next;
+ int sg_ind;
+ int i;
+ int ret;
+
+ if (in_len < wqe_size * wr_count +
+ sge_count * sizeof (struct ib_uverbs_sge))
+ return ERR_PTR(-EINVAL);
+
+ if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+ return ERR_PTR(-EINVAL);
+
+ user_wr = kmalloc(wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return ERR_PTR(-ENOMEM);
+
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < wr_count; ++i) {
+ if (copy_from_user(user_wr, buf + i * wqe_size,
+ wqe_size)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (user_wr->num_sge + sg_ind > sge_count) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (user_wr->num_sge >=
+ (U32_MAX - ALIGN(sizeof *next, sizeof (struct ib_sge))) /
+ sizeof (struct ib_sge)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+ user_wr->num_sge * sizeof (struct ib_sge),
+ GFP_KERNEL);
+ if (!next) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+
+ if (next->num_sge) {
+ next->sg_list = (void *) next +
+ ALIGN(sizeof *next, sizeof (struct ib_sge));
+ if (copy_from_user(next->sg_list,
+ buf + wr_count * wqe_size +
+ sg_ind * sizeof (struct ib_sge),
+ next->num_sge * sizeof (struct ib_sge))) {
+ ret = -EFAULT;
+ goto err;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ kfree(user_wr);
+ return wr;
+
+err:
+ kfree(user_wr);
+
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ERR_PTR(ret);
+}
+
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_recv cmd;
+ struct ib_uverbs_post_recv_resp resp;
+ struct ib_recv_wr *wr, *next;
+ const struct ib_recv_wr *bad_wr;
+ struct ib_qp *qp;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+ in_len - sizeof cmd, cmd.wr_count,
+ cmd.sge_count, cmd.wqe_size);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ if (!qp)
+ goto out;
+
+ resp.bad_wr = 0;
+ ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr);
+
+ uobj_put_obj_read(qp);
+ if (ret) {
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_srq_recv cmd;
+ struct ib_uverbs_post_srq_recv_resp resp;
+ struct ib_recv_wr *wr, *next;
+ const struct ib_recv_wr *bad_wr;
+ struct ib_srq *srq;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+ in_len - sizeof cmd, cmd.wr_count,
+ cmd.sge_count, cmd.wqe_size);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+ if (!srq)
+ goto out;
+
+ resp.bad_wr = 0;
+ ret = srq->device->post_srq_recv ?
+ srq->device->post_srq_recv(srq, wr, &bad_wr) : -EOPNOTSUPP;
+
+ uobj_put_obj_read(srq);
+
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_ah cmd;
+ struct ib_uverbs_create_ah_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_ah *ah;
+ struct rdma_ah_attr attr = {};
+ int ret;
+ struct ib_udata udata;
+ struct ib_device *ib_dev;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ uobj = uobj_alloc(UVERBS_OBJECT_AH, file, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num);
+ rdma_ah_set_make_grd(&attr, false);
+ rdma_ah_set_dlid(&attr, cmd.attr.dlid);
+ rdma_ah_set_sl(&attr, cmd.attr.sl);
+ rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits);
+ rdma_ah_set_static_rate(&attr, cmd.attr.static_rate);
+ rdma_ah_set_port_num(&attr, cmd.attr.port_num);
+
+ if (cmd.attr.is_global) {
+ rdma_ah_set_grh(&attr, NULL, cmd.attr.grh.flow_label,
+ cmd.attr.grh.sgid_index,
+ cmd.attr.grh.hop_limit,
+ cmd.attr.grh.traffic_class);
+ rdma_ah_set_dgid_raw(&attr, cmd.attr.grh.dgid);
+ } else {
+ rdma_ah_set_ah_flags(&attr, 0);
+ }
+
+ ah = rdma_create_user_ah(pd, &attr, &udata);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_put;
+ }
+
+ ah->uobject = uobj;
+ uobj->user_handle = cmd.user_handle;
+ uobj->object = ah;
+
+ resp.ah_handle = uobj->id;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ uobj_put_obj_read(pd);
+ return uobj_alloc_commit(uobj, in_len);
+
+err_copy:
+ rdma_destroy_ah(ah);
+
+err_put:
+ uobj_put_obj_read(pd);
+
+err:
+ uobj_alloc_abort(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_destroy_ah cmd;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, file,
+ in_len);
+}
+
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_attach_mcast cmd;
+ struct ib_qp *qp;
+ struct ib_uqp_object *obj;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ if (!qp)
+ return -EINVAL;
+
+ obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+ mutex_lock(&obj->mcast_lock);
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ ret = 0;
+ goto out_put;
+ }
+
+ mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+ if (!mcast) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ mcast->lid = cmd.mlid;
+ memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
+
+ ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
+ if (!ret)
+ list_add_tail(&mcast->list, &obj->mcast_list);
+ else
+ kfree(mcast);
+
+out_put:
+ mutex_unlock(&obj->mcast_lock);
+ uobj_put_obj_read(qp);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_detach_mcast cmd;
+ struct ib_uqp_object *obj;
+ struct ib_qp *qp;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret = -EINVAL;
+ bool found = false;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ if (!qp)
+ return -EINVAL;
+
+ obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+ mutex_lock(&obj->mcast_lock);
+
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ list_del(&mcast->list);
+ kfree(mcast);
+ found = true;
+ break;
+ }
+
+ if (!found) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = ib_detach_mcast(qp, (union ib_gid *)cmd.gid, cmd.mlid);
+
+out_put:
+ mutex_unlock(&obj->mcast_lock);
+ uobj_put_obj_read(qp);
+ return ret ? ret : in_len;
+}
+
+struct ib_uflow_resources {
+ size_t max;
+ size_t num;
+ size_t collection_num;
+ size_t counters_num;
+ struct ib_counters **counters;
+ struct ib_flow_action **collection;
+};
+
+static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
+{
+ struct ib_uflow_resources *resources;
+
+ resources = kzalloc(sizeof(*resources), GFP_KERNEL);
+
+ if (!resources)
+ return NULL;
+
+ if (!num_specs)
+ goto out;
+
+ resources->counters =
+ kcalloc(num_specs, sizeof(*resources->counters), GFP_KERNEL);
+ resources->collection =
+ kcalloc(num_specs, sizeof(*resources->collection), GFP_KERNEL);
+
+ if (!resources->counters || !resources->collection)
+ goto err;
+
+out:
+ resources->max = num_specs;
+ return resources;
+
+err:
+ kfree(resources->counters);
+ kfree(resources);
+
+ return NULL;
+}
+
+void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res)
+{
+ unsigned int i;
+
+ if (!uflow_res)
+ return;
+
+ for (i = 0; i < uflow_res->collection_num; i++)
+ atomic_dec(&uflow_res->collection[i]->usecnt);
+
+ for (i = 0; i < uflow_res->counters_num; i++)
+ atomic_dec(&uflow_res->counters[i]->usecnt);
+
+ kfree(uflow_res->collection);
+ kfree(uflow_res->counters);
+ kfree(uflow_res);
+}
+
+static void flow_resources_add(struct ib_uflow_resources *uflow_res,
+ enum ib_flow_spec_type type,
+ void *ibobj)
+{
+ WARN_ON(uflow_res->num >= uflow_res->max);
+
+ switch (type) {
+ case IB_FLOW_SPEC_ACTION_HANDLE:
+ atomic_inc(&((struct ib_flow_action *)ibobj)->usecnt);
+ uflow_res->collection[uflow_res->collection_num++] =
+ (struct ib_flow_action *)ibobj;
+ break;
+ case IB_FLOW_SPEC_ACTION_COUNT:
+ atomic_inc(&((struct ib_counters *)ibobj)->usecnt);
+ uflow_res->counters[uflow_res->counters_num++] =
+ (struct ib_counters *)ibobj;
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ uflow_res->num++;
+}
+
+static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile,
+ struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec,
+ struct ib_uflow_resources *uflow_res)
+{
+ ib_spec->type = kern_spec->type;
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ACTION_TAG:
+ if (kern_spec->flow_tag.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_tag))
+ return -EINVAL;
+
+ ib_spec->flow_tag.size = sizeof(struct ib_flow_spec_action_tag);
+ ib_spec->flow_tag.tag_id = kern_spec->flow_tag.tag_id;
+ break;
+ case IB_FLOW_SPEC_ACTION_DROP:
+ if (kern_spec->drop.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_drop))
+ return -EINVAL;
+
+ ib_spec->drop.size = sizeof(struct ib_flow_spec_action_drop);
+ break;
+ case IB_FLOW_SPEC_ACTION_HANDLE:
+ if (kern_spec->action.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_handle))
+ return -EOPNOTSUPP;
+ ib_spec->action.act = uobj_get_obj_read(flow_action,
+ UVERBS_OBJECT_FLOW_ACTION,
+ kern_spec->action.handle,
+ ufile);
+ if (!ib_spec->action.act)
+ return -EINVAL;
+ ib_spec->action.size =
+ sizeof(struct ib_flow_spec_action_handle);
+ flow_resources_add(uflow_res,
+ IB_FLOW_SPEC_ACTION_HANDLE,
+ ib_spec->action.act);
+ uobj_put_obj_read(ib_spec->action.act);
+ break;
+ case IB_FLOW_SPEC_ACTION_COUNT:
+ if (kern_spec->flow_count.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_count))
+ return -EINVAL;
+ ib_spec->flow_count.counters =
+ uobj_get_obj_read(counters,
+ UVERBS_OBJECT_COUNTERS,
+ kern_spec->flow_count.handle,
+ ufile);
+ if (!ib_spec->flow_count.counters)
+ return -EINVAL;
+ ib_spec->flow_count.size =
+ sizeof(struct ib_flow_spec_action_count);
+ flow_resources_add(uflow_res,
+ IB_FLOW_SPEC_ACTION_COUNT,
+ ib_spec->flow_count.counters);
+ uobj_put_obj_read(ib_spec->flow_count.counters);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size,
+ u16 ib_real_filter_sz)
+{
+ /*
+ * User space filter structures must be 64 bit aligned, otherwise this
+ * may pass, but we won't handle additional new attributes.
+ */
+
+ if (kern_filter_size > ib_real_filter_sz) {
+ if (memchr_inv(kern_spec_filter +
+ ib_real_filter_sz, 0,
+ kern_filter_size - ib_real_filter_sz))
+ return -EINVAL;
+ return ib_real_filter_sz;
+ }
+ return kern_filter_size;
+}
+
+int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
+ const void *kern_spec_mask,
+ const void *kern_spec_val,
+ size_t kern_filter_sz,
+ union ib_flow_spec *ib_spec)
+{
+ ssize_t actual_filter_sz;
+ ssize_t ib_filter_sz;
+
+ /* User flow spec size must be aligned to 4 bytes */
+ if (kern_filter_sz != ALIGN(kern_filter_sz, 4))
+ return -EINVAL;
+
+ ib_spec->type = type;
+
+ if (ib_spec->type == (IB_FLOW_SPEC_INNER | IB_FLOW_SPEC_VXLAN_TUNNEL))
+ return -EINVAL;
+
+ switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
+ case IB_FLOW_SPEC_ETH:
+ ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_eth);
+ memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_IPV4:
+ ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_ipv4);
+ memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_IPV6:
+ ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_ipv6);
+ memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz);
+
+ if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) ||
+ (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20))
+ return -EINVAL;
+ break;
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp);
+ memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_VXLAN_TUNNEL:
+ ib_filter_sz = offsetof(struct ib_flow_tunnel_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->tunnel.size = sizeof(struct ib_flow_spec_tunnel);
+ memcpy(&ib_spec->tunnel.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->tunnel.mask, kern_spec_mask, actual_filter_sz);
+
+ if ((ntohl(ib_spec->tunnel.mask.tunnel_id)) >= BIT(24) ||
+ (ntohl(ib_spec->tunnel.val.tunnel_id)) >= BIT(24))
+ return -EINVAL;
+ break;
+ case IB_FLOW_SPEC_ESP:
+ ib_filter_sz = offsetof(struct ib_flow_esp_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->esp.size = sizeof(struct ib_flow_spec_esp);
+ memcpy(&ib_spec->esp.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->esp.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_GRE:
+ ib_filter_sz = offsetof(struct ib_flow_gre_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->gre.size = sizeof(struct ib_flow_spec_gre);
+ memcpy(&ib_spec->gre.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->gre.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_MPLS:
+ ib_filter_sz = offsetof(struct ib_flow_mpls_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->mpls.size = sizeof(struct ib_flow_spec_mpls);
+ memcpy(&ib_spec->mpls.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->mpls.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec)
+{
+ size_t kern_filter_sz;
+ void *kern_spec_mask;
+ void *kern_spec_val;
+
+ if (check_sub_overflow((size_t)kern_spec->hdr.size,
+ sizeof(struct ib_uverbs_flow_spec_hdr),
+ &kern_filter_sz))
+ return -EINVAL;
+
+ kern_filter_sz /= 2;
+
+ kern_spec_val = (void *)kern_spec +
+ sizeof(struct ib_uverbs_flow_spec_hdr);
+ kern_spec_mask = kern_spec_val + kern_filter_sz;
+
+ return ib_uverbs_kern_spec_to_ib_spec_filter(kern_spec->type,
+ kern_spec_mask,
+ kern_spec_val,
+ kern_filter_sz, ib_spec);
+}
+
+static int kern_spec_to_ib_spec(struct ib_uverbs_file *ufile,
+ struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec,
+ struct ib_uflow_resources *uflow_res)
+{
+ if (kern_spec->reserved)
+ return -EINVAL;
+
+ if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG)
+ return kern_spec_to_ib_spec_action(ufile, kern_spec, ib_spec,
+ uflow_res);
+ else
+ return kern_spec_to_ib_spec_filter(kern_spec, ib_spec);
+}
+
+int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_wq cmd = {};
+ struct ib_uverbs_ex_create_wq_resp resp = {};
+ struct ib_uwq_object *obj;
+ int err = 0;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_wq *wq;
+ struct ib_wq_init_attr wq_init_attr = {};
+ size_t required_cmd_sz;
+ size_t required_resp_len;
+ struct ib_device *ib_dev;
+
+ required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge);
+ required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn);
+
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, file,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ if (!pd) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ if (!cq) {
+ err = -EINVAL;
+ goto err_put_pd;
+ }
+
+ wq_init_attr.cq = cq;
+ wq_init_attr.max_sge = cmd.max_sge;
+ wq_init_attr.max_wr = cmd.max_wr;
+ wq_init_attr.wq_context = file;
+ wq_init_attr.wq_type = cmd.wq_type;
+ wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
+ if (ucore->inlen >= (offsetof(typeof(cmd), create_flags) +
+ sizeof(cmd.create_flags)))
+ wq_init_attr.create_flags = cmd.create_flags;
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+
+ if (!pd->device->create_wq) {
+ err = -EOPNOTSUPP;
+ goto err_put_cq;
+ }
+ wq = pd->device->create_wq(pd, &wq_init_attr, uhw);
+ if (IS_ERR(wq)) {
+ err = PTR_ERR(wq);
+ goto err_put_cq;
+ }
+
+ wq->uobject = &obj->uevent.uobject;
+ obj->uevent.uobject.object = wq;
+ wq->wq_type = wq_init_attr.wq_type;
+ wq->cq = cq;
+ wq->pd = pd;
+ wq->device = pd->device;
+ wq->wq_context = wq_init_attr.wq_context;
+ atomic_set(&wq->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&cq->usecnt);
+ wq->uobject = &obj->uevent.uobject;
+ obj->uevent.uobject.object = wq;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.wq_handle = obj->uevent.uobject.id;
+ resp.max_sge = wq_init_attr.max_sge;
+ resp.max_wr = wq_init_attr.max_wr;
+ resp.wqn = wq->wq_num;
+ resp.response_length = required_resp_len;
+ err = ib_copy_to_udata(ucore,
+ &resp, resp.response_length);
+ if (err)
+ goto err_copy;
+
+ uobj_put_obj_read(pd);
+ uobj_put_obj_read(cq);
+ return uobj_alloc_commit(&obj->uevent.uobject, 0);
+
+err_copy:
+ ib_destroy_wq(wq);
+err_put_cq:
+ uobj_put_obj_read(cq);
+err_put_pd:
+ uobj_put_obj_read(pd);
+err_uobj:
+ uobj_alloc_abort(&obj->uevent.uobject);
+
+ return err;
+}
+
+int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_destroy_wq cmd = {};
+ struct ib_uverbs_ex_destroy_wq_resp resp = {};
+ struct ib_uobject *uobj;
+ struct ib_uwq_object *obj;
+ size_t required_cmd_sz;
+ size_t required_resp_len;
+ int ret;
+
+ required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle);
+ required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ resp.response_length = required_resp_len;
+ uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, file);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_uwq_object, uevent.uobject);
+ resp.events_reported = obj->uevent.events_reported;
+
+ uobj_put_destroy(uobj);
+
+ return ib_copy_to_udata(ucore, &resp, resp.response_length);
+}
+
+int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_modify_wq cmd = {};
+ struct ib_wq *wq;
+ struct ib_wq_attr wq_attr = {};
+ size_t required_cmd_sz;
+ int ret;
+
+ required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state);
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (!cmd.attr_mask)
+ return -EINVAL;
+
+ if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS))
+ return -EINVAL;
+
+ wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, file);
+ if (!wq)
+ return -EINVAL;
+
+ wq_attr.curr_wq_state = cmd.curr_wq_state;
+ wq_attr.wq_state = cmd.wq_state;
+ if (cmd.attr_mask & IB_WQ_FLAGS) {
+ wq_attr.flags = cmd.flags;
+ wq_attr.flags_mask = cmd.flags_mask;
+ }
+ if (!wq->device->modify_wq) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw);
+out:
+ uobj_put_obj_read(wq);
+ return ret;
+}
+
+int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_rwq_ind_table cmd = {};
+ struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {};
+ struct ib_uobject *uobj;
+ int err = 0;
+ struct ib_rwq_ind_table_init_attr init_attr = {};
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ struct ib_wq **wqs = NULL;
+ u32 *wqs_handles = NULL;
+ struct ib_wq *wq = NULL;
+ int i, j, num_read_wqs;
+ u32 num_wq_handles;
+ u32 expected_in_size;
+ size_t required_cmd_sz_header;
+ size_t required_resp_len;
+ struct ib_device *ib_dev;
+
+ required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size);
+ required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num);
+
+ if (ucore->inlen < required_cmd_sz_header)
+ return -EINVAL;
+
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
+
+ err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header);
+ if (err)
+ return err;
+
+ ucore->inbuf += required_cmd_sz_header;
+ ucore->inlen -= required_cmd_sz_header;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE)
+ return -EINVAL;
+
+ num_wq_handles = 1 << cmd.log_ind_tbl_size;
+ expected_in_size = num_wq_handles * sizeof(__u32);
+ if (num_wq_handles == 1)
+ /* input size for wq handles is u64 aligned */
+ expected_in_size += sizeof(__u32);
+
+ if (ucore->inlen < expected_in_size)
+ return -EINVAL;
+
+ if (ucore->inlen > expected_in_size &&
+ !ib_is_udata_cleared(ucore, expected_in_size,
+ ucore->inlen - expected_in_size))
+ return -EOPNOTSUPP;
+
+ wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles),
+ GFP_KERNEL);
+ if (!wqs_handles)
+ return -ENOMEM;
+
+ err = ib_copy_from_udata(wqs_handles, ucore,
+ num_wq_handles * sizeof(__u32));
+ if (err)
+ goto err_free;
+
+ wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL);
+ if (!wqs) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ for (num_read_wqs = 0; num_read_wqs < num_wq_handles;
+ num_read_wqs++) {
+ wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ,
+ wqs_handles[num_read_wqs], file);
+ if (!wq) {
+ err = -EINVAL;
+ goto put_wqs;
+ }
+
+ wqs[num_read_wqs] = wq;
+ }
+
+ uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, file, &ib_dev);
+ if (IS_ERR(uobj)) {
+ err = PTR_ERR(uobj);
+ goto put_wqs;
+ }
+
+ init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
+ init_attr.ind_tbl = wqs;
+
+ if (!ib_dev->create_rwq_ind_table) {
+ err = -EOPNOTSUPP;
+ goto err_uobj;
+ }
+ rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw);
+
+ if (IS_ERR(rwq_ind_tbl)) {
+ err = PTR_ERR(rwq_ind_tbl);
+ goto err_uobj;
+ }
+
+ rwq_ind_tbl->ind_tbl = wqs;
+ rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size;
+ rwq_ind_tbl->uobject = uobj;
+ uobj->object = rwq_ind_tbl;
+ rwq_ind_tbl->device = ib_dev;
+ atomic_set(&rwq_ind_tbl->usecnt, 0);
+
+ for (i = 0; i < num_wq_handles; i++)
+ atomic_inc(&wqs[i]->usecnt);
+
+ resp.ind_tbl_handle = uobj->id;
+ resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num;
+ resp.response_length = required_resp_len;
+
+ err = ib_copy_to_udata(ucore,
+ &resp, resp.response_length);
+ if (err)
+ goto err_copy;
+
+ kfree(wqs_handles);
+
+ for (j = 0; j < num_read_wqs; j++)
+ uobj_put_obj_read(wqs[j]);
+
+ return uobj_alloc_commit(uobj, 0);
+
+err_copy:
+ ib_destroy_rwq_ind_table(rwq_ind_tbl);
+err_uobj:
+ uobj_alloc_abort(uobj);
+put_wqs:
+ for (j = 0; j < num_read_wqs; j++)
+ uobj_put_obj_read(wqs[j]);
+err_free:
+ kfree(wqs_handles);
+ kfree(wqs);
+ return err;
+}
+
+int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {};
+ int ret;
+ size_t required_cmd_sz;
+
+ required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle);
+
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_RWQ_IND_TBL,
+ cmd.ind_tbl_handle, file, 0);
+}
+
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_create_flow cmd;
+ struct ib_uverbs_create_flow_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_uflow_object *uflow;
+ struct ib_flow *flow_id;
+ struct ib_uverbs_flow_attr *kern_flow_attr;
+ struct ib_flow_attr *flow_attr;
+ struct ib_qp *qp;
+ struct ib_uflow_resources *uflow_res;
+ struct ib_uverbs_flow_spec_hdr *kern_spec;
+ int err = 0;
+ void *ib_spec;
+ int i;
+ struct ib_device *ib_dev;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ if (ucore->outlen < sizeof(resp))
+ return -ENOSPC;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ ucore->inbuf += sizeof(cmd);
+ ucore->inlen -= sizeof(cmd);
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
+
+ if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
+ return -EINVAL;
+
+ if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) &&
+ ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) ||
+ (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT)))
+ return -EINVAL;
+
+ if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+ return -EINVAL;
+
+ if (cmd.flow_attr.size > ucore->inlen ||
+ cmd.flow_attr.size >
+ (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
+ return -EINVAL;
+
+ if (cmd.flow_attr.reserved[0] ||
+ cmd.flow_attr.reserved[1])
+ return -EINVAL;
+
+ if (cmd.flow_attr.num_of_specs) {
+ kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+ GFP_KERNEL);
+ if (!kern_flow_attr)
+ return -ENOMEM;
+
+ *kern_flow_attr = cmd.flow_attr;
+ err = ib_copy_from_udata(&kern_flow_attr->flow_specs, ucore,
+ cmd.flow_attr.size);
+ if (err)
+ goto err_free_attr;
+ } else {
+ kern_flow_attr = &cmd.flow_attr;
+ }
+
+ uobj = uobj_alloc(UVERBS_OBJECT_FLOW, file, &ib_dev);
+ if (IS_ERR(uobj)) {
+ err = PTR_ERR(uobj);
+ goto err_free_attr;
+ }
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ if (!qp) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ if (qp->qp_type != IB_QPT_UD && qp->qp_type != IB_QPT_RAW_PACKET) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ if (!qp->device->create_flow) {
+ err = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ flow_attr = kzalloc(struct_size(flow_attr, flows,
+ cmd.flow_attr.num_of_specs), GFP_KERNEL);
+ if (!flow_attr) {
+ err = -ENOMEM;
+ goto err_put;
+ }
+ uflow_res = flow_resources_alloc(cmd.flow_attr.num_of_specs);
+ if (!uflow_res) {
+ err = -ENOMEM;
+ goto err_free_flow_attr;
+ }
+
+ flow_attr->type = kern_flow_attr->type;
+ flow_attr->priority = kern_flow_attr->priority;
+ flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+ flow_attr->port = kern_flow_attr->port;
+ flow_attr->flags = kern_flow_attr->flags;
+ flow_attr->size = sizeof(*flow_attr);
+
+ kern_spec = kern_flow_attr->flow_specs;
+ ib_spec = flow_attr + 1;
+ for (i = 0; i < flow_attr->num_of_specs &&
+ cmd.flow_attr.size >= sizeof(*kern_spec) &&
+ cmd.flow_attr.size >= kern_spec->size;
+ i++) {
+ err = kern_spec_to_ib_spec(
+ file, (struct ib_uverbs_flow_spec *)kern_spec,
+ ib_spec, uflow_res);
+ if (err)
+ goto err_free;
+
+ flow_attr->size +=
+ ((union ib_flow_spec *) ib_spec)->size;
+ cmd.flow_attr.size -= kern_spec->size;
+ kern_spec = ((void *)kern_spec) + kern_spec->size;
+ ib_spec += ((union ib_flow_spec *) ib_spec)->size;
+ }
+ if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+ pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
+ i, cmd.flow_attr.size);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ flow_id = qp->device->create_flow(qp, flow_attr,
+ IB_FLOW_DOMAIN_USER, uhw);
+
+ if (IS_ERR(flow_id)) {
+ err = PTR_ERR(flow_id);
+ goto err_free;
+ }
+ atomic_inc(&qp->usecnt);
+ flow_id->qp = qp;
+ flow_id->device = qp->device;
+ flow_id->uobject = uobj;
+ uobj->object = flow_id;
+ uflow = container_of(uobj, typeof(*uflow), uobject);
+ uflow->resources = uflow_res;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.flow_handle = uobj->id;
+
+ err = ib_copy_to_udata(ucore,
+ &resp, sizeof(resp));
+ if (err)
+ goto err_copy;
+
+ uobj_put_obj_read(qp);
+ kfree(flow_attr);
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return uobj_alloc_commit(uobj, 0);
+err_copy:
+ if (!qp->device->destroy_flow(flow_id))
+ atomic_dec(&qp->usecnt);
+err_free:
+ ib_uverbs_flow_resources_free(uflow_res);
+err_free_flow_attr:
+ kfree(flow_attr);
+err_put:
+ uobj_put_obj_read(qp);
+err_uobj:
+ uobj_alloc_abort(uobj);
+err_free_attr:
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return err;
+}
+
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_destroy_flow cmd;
+ int ret;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, file,
+ 0);
+}
+
+static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_uverbs_create_xsrq *cmd,
+ struct ib_udata *udata)
+{
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_usrq_object *obj;
+ struct ib_pd *pd;
+ struct ib_srq *srq;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_srq_init_attr attr;
+ int ret;
+ struct ib_device *ib_dev;
+
+ obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, file,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ if (cmd->srq_type == IB_SRQT_TM)
+ attr.ext.tag_matching.max_num_tags = cmd->max_num_tags;
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle,
+ file);
+ if (IS_ERR(xrcd_uobj)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!attr.ext.xrc.xrcd) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ }
+
+ if (ib_srq_has_cq(cmd->srq_type)) {
+ attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
+ cmd->cq_handle, file);
+ if (!attr.ext.cq) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+ }
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_put_cq;
+ }
+
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ attr.srq_context = file;
+ attr.srq_type = cmd->srq_type;
+ attr.attr.max_wr = cmd->max_wr;
+ attr.attr.max_sge = cmd->max_sge;
+ attr.attr.srq_limit = cmd->srq_limit;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+
+ srq = pd->device->create_srq(pd, &attr, udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err_put;
+ }
+
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->srq_type = cmd->srq_type;
+ srq->uobject = &obj->uevent.uobject;
+ srq->event_handler = attr.event_handler;
+ srq->srq_context = attr.srq_context;
+
+ if (ib_srq_has_cq(cmd->srq_type)) {
+ srq->ext.cq = attr.ext.cq;
+ atomic_inc(&attr.ext.cq->usecnt);
+ }
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
+ atomic_inc(&attr.ext.xrc.xrcd->usecnt);
+ }
+
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+
+ obj->uevent.uobject.object = srq;
+ obj->uevent.uobject.user_handle = cmd->user_handle;
+
+ memset(&resp, 0, sizeof resp);
+ resp.srq_handle = obj->uevent.uobject.id;
+ resp.max_wr = attr.attr.max_wr;
+ resp.max_sge = attr.attr.max_sge;
+ if (cmd->srq_type == IB_SRQT_XRC)
+ resp.srqn = srq->ext.xrc.srq_num;
+
+ if (copy_to_user(u64_to_user_ptr(cmd->response),
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (cmd->srq_type == IB_SRQT_XRC)
+ uobj_put_read(xrcd_uobj);
+
+ if (ib_srq_has_cq(cmd->srq_type))
+ uobj_put_obj_read(attr.ext.cq);
+
+ uobj_put_obj_read(pd);
+ return uobj_alloc_commit(&obj->uevent.uobject, 0);
+
+err_copy:
+ ib_destroy_srq(srq);
+
+err_put:
+ uobj_put_obj_read(pd);
+
+err_put_cq:
+ if (ib_srq_has_cq(cmd->srq_type))
+ uobj_put_obj_read(attr.ext.cq);
+
+err_put_xrcd:
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ atomic_dec(&obj->uxrcd->refcnt);
+ uobj_put_read(xrcd_uobj);
+ }
+
+err:
+ uobj_alloc_abort(&obj->uevent.uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_srq cmd;
+ struct ib_uverbs_create_xsrq xcmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ memset(&xcmd, 0, sizeof(xcmd));
+ xcmd.response = cmd.response;
+ xcmd.user_handle = cmd.user_handle;
+ xcmd.srq_type = IB_SRQT_BASIC;
+ xcmd.pd_handle = cmd.pd_handle;
+ xcmd.max_wr = cmd.max_wr;
+ xcmd.max_sge = cmd.max_sge;
+ xcmd.srq_limit = cmd.srq_limit;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+ if (ret)
+ return ret;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_create_xsrq cmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
+ u64_to_user_ptr(cmd.response) + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ if (ret)
+ return ret;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_srq cmd;
+ struct ib_udata udata;
+ struct ib_srq *srq;
+ struct ib_srq_attr attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+ out_len);
+
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+ if (!srq)
+ return -EINVAL;
+
+ attr.max_wr = cmd.max_wr;
+ attr.srq_limit = cmd.srq_limit;
+
+ ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+
+ uobj_put_obj_read(srq);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_srq cmd;
+ struct ib_uverbs_query_srq_resp resp;
+ struct ib_srq_attr attr;
+ struct ib_srq *srq;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+ if (!srq)
+ return -EINVAL;
+
+ ret = ib_query_srq(srq, &attr);
+
+ uobj_put_obj_read(srq);
+
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.max_wr = attr.max_wr;
+ resp.max_sge = attr.max_sge;
+ resp.srq_limit = attr.srq_limit;
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_srq cmd;
+ struct ib_uverbs_destroy_srq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_uevent_object *obj;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_uevent_object, uobject);
+ memset(&resp, 0, sizeof(resp));
+ resp.events_reported = obj->events_reported;
+
+ uobj_put_destroy(uobj);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp)))
+ return -EFAULT;
+
+ return in_len;
+}
+
+int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_query_device_resp resp = { {0} };
+ struct ib_uverbs_ex_query_device cmd;
+ struct ib_device_attr attr = {0};
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+ int err;
+
+ ucontext = ib_uverbs_get_ucontext(file);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ if (!ib_dev->query_device)
+ return -EOPNOTSUPP;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ resp.response_length = offsetof(typeof(resp), odp_caps);
+
+ if (ucore->outlen < resp.response_length)
+ return -ENOSPC;
+
+ err = ib_dev->query_device(ib_dev, &attr, uhw);
+ if (err)
+ return err;
+
+ copy_query_dev_fields(ucontext, &resp.base, &attr);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
+ goto end;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ resp.odp_caps.general_caps = attr.odp_caps.general_caps;
+ resp.odp_caps.per_transport_caps.rc_odp_caps =
+ attr.odp_caps.per_transport_caps.rc_odp_caps;
+ resp.odp_caps.per_transport_caps.uc_odp_caps =
+ attr.odp_caps.per_transport_caps.uc_odp_caps;
+ resp.odp_caps.per_transport_caps.ud_odp_caps =
+ attr.odp_caps.per_transport_caps.ud_odp_caps;
+#endif
+ resp.response_length += sizeof(resp.odp_caps);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask))
+ goto end;
+
+ resp.timestamp_mask = attr.timestamp_mask;
+ resp.response_length += sizeof(resp.timestamp_mask);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock))
+ goto end;
+
+ resp.hca_core_clock = attr.hca_core_clock;
+ resp.response_length += sizeof(resp.hca_core_clock);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex))
+ goto end;
+
+ resp.device_cap_flags_ex = attr.device_cap_flags;
+ resp.response_length += sizeof(resp.device_cap_flags_ex);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps))
+ goto end;
+
+ resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts;
+ resp.rss_caps.max_rwq_indirection_tables =
+ attr.rss_caps.max_rwq_indirection_tables;
+ resp.rss_caps.max_rwq_indirection_table_size =
+ attr.rss_caps.max_rwq_indirection_table_size;
+
+ resp.response_length += sizeof(resp.rss_caps);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq))
+ goto end;
+
+ resp.max_wq_type_rq = attr.max_wq_type_rq;
+ resp.response_length += sizeof(resp.max_wq_type_rq);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.raw_packet_caps))
+ goto end;
+
+ resp.raw_packet_caps = attr.raw_packet_caps;
+ resp.response_length += sizeof(resp.raw_packet_caps);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps))
+ goto end;
+
+ resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size;
+ resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags;
+ resp.tm_caps.max_ops = attr.tm_caps.max_ops;
+ resp.tm_caps.max_sge = attr.tm_caps.max_sge;
+ resp.tm_caps.flags = attr.tm_caps.flags;
+ resp.response_length += sizeof(resp.tm_caps);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.cq_moderation_caps))
+ goto end;
+
+ resp.cq_moderation_caps.max_cq_moderation_count =
+ attr.cq_caps.max_cq_moderation_count;
+ resp.cq_moderation_caps.max_cq_moderation_period =
+ attr.cq_caps.max_cq_moderation_period;
+ resp.response_length += sizeof(resp.cq_moderation_caps);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.max_dm_size))
+ goto end;
+
+ resp.max_dm_size = attr.max_dm_size;
+ resp.response_length += sizeof(resp.max_dm_size);
+end:
+ err = ib_copy_to_udata(ucore, &resp, resp.response_length);
+ return err;
+}
+
+int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_modify_cq cmd = {};
+ struct ib_cq *cq;
+ size_t required_cmd_sz;
+ int ret;
+
+ required_cmd_sz = offsetof(typeof(cmd), reserved) +
+ sizeof(cmd.reserved);
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ /* sanity checks */
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (!cmd.attr_mask || cmd.reserved)
+ return -EINVAL;
+
+ if (cmd.attr_mask > IB_CQ_MODERATE)
+ return -EOPNOTSUPP;
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ if (!cq)
+ return -EINVAL;
+
+ ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period);
+
+ uobj_put_obj_read(cq);
+
+ return ret;
+}
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
new file mode 100644
index 000000000..1a6b229e3
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/rdma_user_ioctl.h>
+#include <rdma/uverbs_ioctl.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+struct bundle_alloc_head {
+ struct bundle_alloc_head *next;
+ u8 data[];
+};
+
+struct bundle_priv {
+ /* Must be first */
+ struct bundle_alloc_head alloc_head;
+ struct bundle_alloc_head *allocated_mem;
+ size_t internal_avail;
+ size_t internal_used;
+
+ struct radix_tree_root *radix;
+ const struct uverbs_api_ioctl_method *method_elm;
+ void __rcu **radix_slots;
+ unsigned long radix_slots_len;
+ u32 method_key;
+
+ struct ib_uverbs_attr __user *user_attrs;
+ struct ib_uverbs_attr *uattrs;
+
+ DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN);
+
+ /*
+ * Must be last. bundle ends in a flex array which overlaps
+ * internal_buffer.
+ */
+ struct uverbs_attr_bundle bundle;
+ u64 internal_buffer[32];
+};
+
+/*
+ * Each method has an absolute minimum amount of memory it needs to allocate,
+ * precompute that amount and determine if the onstack memory can be used or
+ * if allocation is need.
+ */
+void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
+ unsigned int num_attrs)
+{
+ struct bundle_priv *pbundle;
+ size_t bundle_size =
+ offsetof(struct bundle_priv, internal_buffer) +
+ sizeof(*pbundle->bundle.attrs) * method_elm->key_bitmap_len +
+ sizeof(*pbundle->uattrs) * num_attrs;
+
+ method_elm->use_stack = bundle_size <= sizeof(*pbundle);
+ method_elm->bundle_size =
+ ALIGN(bundle_size + 256, sizeof(*pbundle->internal_buffer));
+
+ /* Do not want order-2 allocations for this. */
+ WARN_ON_ONCE(method_elm->bundle_size > PAGE_SIZE);
+}
+
+/**
+ * uverbs_alloc() - Quickly allocate memory for use with a bundle
+ * @bundle: The bundle
+ * @size: Number of bytes to allocate
+ * @flags: Allocator flags
+ *
+ * The bundle allocator is intended for allocations that are connected with
+ * processing the system call related to the bundle. The allocated memory is
+ * always freed once the system call completes, and cannot be freed any other
+ * way.
+ *
+ * This tries to use a small pool of pre-allocated memory for performance.
+ */
+__malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size,
+ gfp_t flags)
+{
+ struct bundle_priv *pbundle =
+ container_of(bundle, struct bundle_priv, bundle);
+ size_t new_used;
+ void *res;
+
+ if (check_add_overflow(size, pbundle->internal_used, &new_used))
+ return ERR_PTR(-EOVERFLOW);
+
+ if (new_used > pbundle->internal_avail) {
+ struct bundle_alloc_head *buf;
+
+ buf = kvmalloc(struct_size(buf, data, size), flags);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+ buf->next = pbundle->allocated_mem;
+ pbundle->allocated_mem = buf;
+ return buf->data;
+ }
+
+ res = (void *)pbundle->internal_buffer + pbundle->internal_used;
+ pbundle->internal_used =
+ ALIGN(new_used, sizeof(*pbundle->internal_buffer));
+ if (flags & __GFP_ZERO)
+ memset(res, 0, size);
+ return res;
+}
+EXPORT_SYMBOL(_uverbs_alloc);
+
+static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr,
+ u16 len)
+{
+ if (uattr->len > sizeof(((struct ib_uverbs_attr *)0)->data))
+ return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len,
+ uattr->len - len);
+
+ return !memchr_inv((const void *)&uattr->data + len,
+ 0, uattr->len - len);
+}
+
+static int uverbs_process_attr(struct bundle_priv *pbundle,
+ const struct uverbs_api_attr *attr_uapi,
+ struct ib_uverbs_attr *uattr, u32 attr_bkey)
+{
+ const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+ struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey];
+ const struct uverbs_attr_spec *val_spec = spec;
+ struct uverbs_obj_attr *o_attr;
+
+ switch (spec->type) {
+ case UVERBS_ATTR_TYPE_ENUM_IN:
+ if (uattr->attr_data.enum_data.elem_id >= spec->u.enum_def.num_elems)
+ return -EOPNOTSUPP;
+
+ if (uattr->attr_data.enum_data.reserved)
+ return -EINVAL;
+
+ val_spec = &spec->u2.enum_def.ids[uattr->attr_data.enum_data.elem_id];
+
+ /* Currently we only support PTR_IN based enums */
+ if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN)
+ return -EOPNOTSUPP;
+
+ e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id;
+ /* fall through */
+ case UVERBS_ATTR_TYPE_PTR_IN:
+ /* Ensure that any data provided by userspace beyond the known
+ * struct is zero. Userspace that knows how to use some future
+ * longer struct will fail here if used with an old kernel and
+ * non-zero content, making ABI compat/discovery simpler.
+ */
+ if (uattr->len > val_spec->u.ptr.len &&
+ val_spec->zero_trailing &&
+ !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len))
+ return -EOPNOTSUPP;
+
+ /* fall through */
+ case UVERBS_ATTR_TYPE_PTR_OUT:
+ if (uattr->len < val_spec->u.ptr.min_len ||
+ (!val_spec->zero_trailing &&
+ uattr->len > val_spec->u.ptr.len))
+ return -EINVAL;
+
+ if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN &&
+ uattr->attr_data.reserved)
+ return -EINVAL;
+
+ e->ptr_attr.uattr_idx = uattr - pbundle->uattrs;
+ e->ptr_attr.len = uattr->len;
+
+ if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) {
+ void *p;
+
+ p = uverbs_alloc(&pbundle->bundle, uattr->len);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ e->ptr_attr.ptr = p;
+
+ if (copy_from_user(p, u64_to_user_ptr(uattr->data),
+ uattr->len))
+ return -EFAULT;
+ } else {
+ e->ptr_attr.data = uattr->data;
+ }
+ break;
+
+ case UVERBS_ATTR_TYPE_IDR:
+ case UVERBS_ATTR_TYPE_FD:
+ if (uattr->attr_data.reserved)
+ return -EINVAL;
+
+ if (uattr->len != 0)
+ return -EINVAL;
+
+ o_attr = &e->obj_attr;
+ o_attr->attr_elm = attr_uapi;
+
+ /*
+ * The type of uattr->data is u64 for UVERBS_ATTR_TYPE_IDR and
+ * s64 for UVERBS_ATTR_TYPE_FD. We can cast the u64 to s64
+ * here without caring about truncation as we know that the
+ * IDR implementation today rejects negative IDs
+ */
+ o_attr->uobject = uverbs_get_uobject_from_file(
+ spec->u.obj.obj_type,
+ pbundle->bundle.ufile,
+ spec->u.obj.access,
+ uattr->data_s64);
+ if (IS_ERR(o_attr->uobject))
+ return PTR_ERR(o_attr->uobject);
+ __set_bit(attr_bkey, pbundle->uobj_finalize);
+
+ if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
+ unsigned int uattr_idx = uattr - pbundle->uattrs;
+ s64 id = o_attr->uobject->id;
+
+ /* Copy the allocated id to the user-space */
+ if (put_user(id, &pbundle->user_attrs[uattr_idx].data))
+ return -EFAULT;
+ }
+
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+/*
+ * We search the radix tree with the method prefix and now we want to fast
+ * search the suffix bits to get a particular attribute pointer. It is not
+ * totally clear to me if this breaks the radix tree encasulation or not, but
+ * it uses the iter data to determine if the method iter points at the same
+ * chunk that will store the attribute, if so it just derefs it directly. By
+ * construction in most kernel configs the method and attrs will all fit in a
+ * single radix chunk, so in most cases this will have no search. Other cases
+ * this falls back to a full search.
+ */
+static void __rcu **uapi_get_attr_for_method(struct bundle_priv *pbundle,
+ u32 attr_key)
+{
+ void __rcu **slot;
+
+ if (likely(attr_key < pbundle->radix_slots_len)) {
+ void *entry;
+
+ slot = pbundle->radix_slots + attr_key;
+ entry = rcu_dereference_raw(*slot);
+ if (likely(!radix_tree_is_internal_node(entry) && entry))
+ return slot;
+ }
+
+ return radix_tree_lookup_slot(pbundle->radix,
+ pbundle->method_key | attr_key);
+}
+
+static int uverbs_set_attr(struct bundle_priv *pbundle,
+ struct ib_uverbs_attr *uattr)
+{
+ u32 attr_key = uapi_key_attr(uattr->attr_id);
+ u32 attr_bkey = uapi_bkey_attr(attr_key);
+ const struct uverbs_api_attr *attr;
+ void __rcu **slot;
+ int ret;
+
+ slot = uapi_get_attr_for_method(pbundle, attr_key);
+ if (!slot) {
+ /*
+ * Kernel does not support the attribute but user-space says it
+ * is mandatory
+ */
+ if (uattr->flags & UVERBS_ATTR_F_MANDATORY)
+ return -EPROTONOSUPPORT;
+ return 0;
+ }
+ attr = srcu_dereference(
+ *slot, &pbundle->bundle.ufile->device->disassociate_srcu);
+
+ /* Reject duplicate attributes from user-space */
+ if (test_bit(attr_bkey, pbundle->bundle.attr_present))
+ return -EINVAL;
+
+ ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey);
+ if (ret)
+ return ret;
+
+ __set_bit(attr_bkey, pbundle->bundle.attr_present);
+
+ return 0;
+}
+
+static int ib_uverbs_run_method(struct bundle_priv *pbundle,
+ unsigned int num_attrs)
+{
+ int (*handler)(struct ib_uverbs_file *ufile,
+ struct uverbs_attr_bundle *ctx);
+ size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
+ unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
+ unsigned int i;
+ int ret;
+
+ /* See uverbs_disassociate_api() */
+ handler = srcu_dereference(
+ pbundle->method_elm->handler,
+ &pbundle->bundle.ufile->device->disassociate_srcu);
+ if (!handler)
+ return -EIO;
+
+ pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size);
+ if (IS_ERR(pbundle->uattrs))
+ return PTR_ERR(pbundle->uattrs);
+ if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size))
+ return -EFAULT;
+
+ for (i = 0; i != num_attrs; i++) {
+ ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ /* User space did not provide all the mandatory attributes */
+ if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory,
+ pbundle->bundle.attr_present,
+ pbundle->method_elm->key_bitmap_len)))
+ return -EINVAL;
+
+ if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
+ struct uverbs_obj_attr *destroy_attr =
+ &pbundle->bundle.attrs[destroy_bkey].obj_attr;
+
+ ret = uobj_destroy(destroy_attr->uobject);
+ if (ret)
+ return ret;
+ __clear_bit(destroy_bkey, pbundle->uobj_finalize);
+
+ ret = handler(pbundle->bundle.ufile, &pbundle->bundle);
+ uobj_put_destroy(destroy_attr->uobject);
+ } else {
+ ret = handler(pbundle->bundle.ufile, &pbundle->bundle);
+ }
+
+ /*
+ * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can
+ * not invoke the method because the request is not supported. No
+ * other cases should return this code.
+ */
+ if (WARN_ON_ONCE(ret == -EPROTONOSUPPORT))
+ return -EINVAL;
+
+ return ret;
+}
+
+static int bundle_destroy(struct bundle_priv *pbundle, bool commit)
+{
+ unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len;
+ struct bundle_alloc_head *memblock;
+ unsigned int i;
+ int ret = 0;
+
+ i = -1;
+ while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len,
+ i + 1)) < key_bitmap_len) {
+ struct uverbs_attr *attr = &pbundle->bundle.attrs[i];
+ int current_ret;
+
+ current_ret = uverbs_finalize_object(
+ attr->obj_attr.uobject,
+ attr->obj_attr.attr_elm->spec.u.obj.access, commit);
+ if (!ret)
+ ret = current_ret;
+ }
+
+ for (memblock = pbundle->allocated_mem; memblock;) {
+ struct bundle_alloc_head *tmp = memblock;
+
+ memblock = memblock->next;
+ kvfree(tmp);
+ }
+
+ return ret;
+}
+
+static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
+ struct ib_uverbs_ioctl_hdr *hdr,
+ struct ib_uverbs_attr __user *user_attrs)
+{
+ const struct uverbs_api_ioctl_method *method_elm;
+ struct uverbs_api *uapi = ufile->device->uapi;
+ struct radix_tree_iter attrs_iter;
+ struct bundle_priv *pbundle;
+ struct bundle_priv onstack;
+ void __rcu **slot;
+ int destroy_ret;
+ int ret;
+
+ if (unlikely(hdr->driver_id != uapi->driver_id))
+ return -EINVAL;
+
+ slot = radix_tree_iter_lookup(
+ &uapi->radix, &attrs_iter,
+ uapi_key_obj(hdr->object_id) |
+ uapi_key_ioctl_method(hdr->method_id));
+ if (unlikely(!slot))
+ return -EPROTONOSUPPORT;
+ method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu);
+
+ if (!method_elm->use_stack) {
+ pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
+ if (!pbundle)
+ return -ENOMEM;
+ pbundle->internal_avail =
+ method_elm->bundle_size -
+ offsetof(struct bundle_priv, internal_buffer);
+ pbundle->alloc_head.next = NULL;
+ pbundle->allocated_mem = &pbundle->alloc_head;
+ } else {
+ pbundle = &onstack;
+ pbundle->internal_avail = sizeof(pbundle->internal_buffer);
+ pbundle->allocated_mem = NULL;
+ }
+
+ /* Space for the pbundle->bundle.attrs flex array */
+ pbundle->method_elm = method_elm;
+ pbundle->method_key = attrs_iter.index;
+ pbundle->bundle.ufile = ufile;
+ pbundle->radix = &uapi->radix;
+ pbundle->radix_slots = slot;
+ pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
+ pbundle->user_attrs = user_attrs;
+
+ pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len *
+ sizeof(*pbundle->bundle.attrs),
+ sizeof(*pbundle->internal_buffer));
+ memset(pbundle->bundle.attr_present, 0,
+ sizeof(pbundle->bundle.attr_present));
+ memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize));
+
+ ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
+ destroy_ret = bundle_destroy(pbundle, ret == 0);
+ if (unlikely(destroy_ret && !ret))
+ return destroy_ret;
+
+ return ret;
+}
+
+long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_uverbs_ioctl_hdr __user *user_hdr =
+ (struct ib_uverbs_ioctl_hdr __user *)arg;
+ struct ib_uverbs_ioctl_hdr hdr;
+ int srcu_key;
+ int err;
+
+ if (unlikely(cmd != RDMA_VERBS_IOCTL))
+ return -ENOIOCTLCMD;
+
+ err = copy_from_user(&hdr, user_hdr, sizeof(hdr));
+ if (err)
+ return -EFAULT;
+
+ if (hdr.length > PAGE_SIZE ||
+ hdr.length != struct_size(&hdr, attrs, hdr.num_attrs))
+ return -EINVAL;
+
+ if (hdr.reserved1 || hdr.reserved2)
+ return -EPROTONOSUPPORT;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return err;
+}
+
+int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits)
+{
+ const struct uverbs_attr *attr;
+ u64 flags;
+
+ attr = uverbs_attr_get(attrs_bundle, idx);
+ /* Missing attribute means 0 flags */
+ if (IS_ERR(attr)) {
+ *to = 0;
+ return 0;
+ }
+
+ /*
+ * New userspace code should use 8 bytes to pass flags, but we
+ * transparently support old userspaces that were using 4 bytes as
+ * well.
+ */
+ if (attr->ptr_attr.len == 8)
+ flags = attr->ptr_attr.data;
+ else if (attr->ptr_attr.len == 4)
+ flags = *(u32 *)&attr->ptr_attr.data;
+ else
+ return -EINVAL;
+
+ if (flags & ~allowed_bits)
+ return -EINVAL;
+
+ *to = flags;
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_get_flags64);
+
+int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits)
+{
+ u64 flags;
+ int ret;
+
+ ret = uverbs_get_flags64(&flags, attrs_bundle, idx, allowed_bits);
+ if (ret)
+ return ret;
+
+ if (flags > U32_MAX)
+ return -EINVAL;
+ *to = flags;
+
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_get_flags32);
+
+/*
+ * This is for ease of conversion. The purpose is to convert all drivers to
+ * use uverbs_attr_bundle instead of ib_udata. Assume attr == 0 is input and
+ * attr == 1 is output.
+ */
+void create_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata)
+{
+ struct bundle_priv *pbundle =
+ container_of(bundle, struct bundle_priv, bundle);
+ const struct uverbs_attr *uhw_in =
+ uverbs_attr_get(bundle, UVERBS_ATTR_UHW_IN);
+ const struct uverbs_attr *uhw_out =
+ uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT);
+
+ if (!IS_ERR(uhw_in)) {
+ udata->inlen = uhw_in->ptr_attr.len;
+ if (uverbs_attr_ptr_is_inline(uhw_in))
+ udata->inbuf =
+ &pbundle->user_attrs[uhw_in->ptr_attr.uattr_idx]
+ .data;
+ else
+ udata->inbuf = u64_to_user_ptr(uhw_in->ptr_attr.data);
+ } else {
+ udata->inbuf = NULL;
+ udata->inlen = 0;
+ }
+
+ if (!IS_ERR(uhw_out)) {
+ udata->outbuf = u64_to_user_ptr(uhw_out->ptr_attr.data);
+ udata->outlen = uhw_out->ptr_attr.len;
+ } else {
+ udata->outbuf = NULL;
+ udata->outlen = 0;
+ }
+}
+
+int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx,
+ const void *from, size_t size)
+{
+ struct bundle_priv *pbundle =
+ container_of(bundle, struct bundle_priv, bundle);
+ const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+ u16 flags;
+ size_t min_size;
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ min_size = min_t(size_t, attr->ptr_attr.len, size);
+ if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size))
+ return -EFAULT;
+
+ flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags |
+ UVERBS_ATTR_F_VALID_OUTPUT;
+ if (put_user(flags,
+ &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_copy_to);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
new file mode 100644
index 000000000..fc4b46258
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/anon_inodes.h>
+#include <linux/slab.h>
+
+#include <linux/uaccess.h>
+
+#include <rdma/ib.h>
+#include <rdma/uverbs_std_types.h>
+
+#include "uverbs.h"
+#include "core_priv.h"
+#include "rdma_core.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UVERBS_MAJOR = 231,
+ IB_UVERBS_BASE_MINOR = 192,
+ IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS,
+ IB_UVERBS_NUM_FIXED_MINOR = 32,
+ IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR,
+};
+
+#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static dev_t dynamic_uverbs_dev;
+static struct class *uverbs_class;
+
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len) = {
+ [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
+ [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device,
+ [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port,
+ [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd,
+ [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
+ [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
+ [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr,
+ [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
+ [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw,
+ [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw,
+ [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+ [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
+ [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
+ [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq,
+ [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq,
+ [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq,
+ [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp,
+ [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp,
+ [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp,
+ [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp,
+ [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send,
+ [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv,
+ [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv,
+ [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah,
+ [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah,
+ [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast,
+ [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast,
+ [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq,
+ [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
+ [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
+ [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+ [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd,
+ [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd,
+ [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq,
+ [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp,
+};
+
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw) = {
+ [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
+ [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
+ [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device,
+ [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq,
+ [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp,
+ [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq,
+ [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq,
+ [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq,
+ [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table,
+ [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table,
+ [IB_USER_VERBS_EX_CMD_MODIFY_QP] = ib_uverbs_ex_modify_qp,
+ [IB_USER_VERBS_EX_CMD_MODIFY_CQ] = ib_uverbs_ex_modify_cq,
+};
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
+
+/*
+ * Must be called with the ufile->device->disassociate_srcu held, and the lock
+ * must be held until use of the ucontext is finished.
+ */
+struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile)
+{
+ /*
+ * We do not hold the hw_destroy_rwsem lock for this flow, instead
+ * srcu is used. It does not matter if someone races this with
+ * get_context, we get NULL or valid ucontext.
+ */
+ struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext);
+
+ if (!srcu_dereference(ufile->device->ib_dev,
+ &ufile->device->disassociate_srcu))
+ return ERR_PTR(-EIO);
+
+ if (!ucontext)
+ return ERR_PTR(-EINVAL);
+
+ return ucontext;
+}
+EXPORT_SYMBOL(ib_uverbs_get_ucontext);
+
+int uverbs_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd = mw->pd;
+ int ret;
+
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+ return ret;
+}
+
+static void ib_uverbs_release_dev(struct kobject *kobj)
+{
+ struct ib_uverbs_device *dev =
+ container_of(kobj, struct ib_uverbs_device, kobj);
+
+ uverbs_destroy_api(dev->uapi);
+ cleanup_srcu_struct(&dev->disassociate_srcu);
+ kfree(dev);
+}
+
+static struct kobj_type ib_uverbs_dev_ktype = {
+ .release = ib_uverbs_release_dev,
+};
+
+static void ib_uverbs_release_async_event_file(struct kref *ref)
+{
+ struct ib_uverbs_async_event_file *file =
+ container_of(ref, struct ib_uverbs_async_event_file, ref);
+
+ kfree(file);
+}
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+ struct ib_uverbs_completion_event_file *ev_file,
+ struct ib_ucq_object *uobj)
+{
+ struct ib_uverbs_event *evt, *tmp;
+
+ if (ev_file) {
+ spin_lock_irq(&ev_file->ev_queue.lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&ev_file->ev_queue.lock);
+
+ uverbs_uobject_put(&ev_file->uobj);
+ }
+
+ spin_lock_irq(&file->async_file->ev_queue.lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&file->async_file->ev_queue.lock);
+}
+
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+ struct ib_uevent_object *uobj)
+{
+ struct ib_uverbs_event *evt, *tmp;
+
+ spin_lock_irq(&file->async_file->ev_queue.lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&file->async_file->ev_queue.lock);
+}
+
+void ib_uverbs_detach_umcast(struct ib_qp *qp,
+ struct ib_uqp_object *uobj)
+{
+ struct ib_uverbs_mcast_entry *mcast, *tmp;
+
+ list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
+ ib_detach_mcast(qp, &mcast->gid, mcast->lid);
+ list_del(&mcast->list);
+ kfree(mcast);
+ }
+}
+
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+ complete(&dev->comp);
+}
+
+void ib_uverbs_release_file(struct kref *ref)
+{
+ struct ib_uverbs_file *file =
+ container_of(ref, struct ib_uverbs_file, ref);
+ struct ib_device *ib_dev;
+ int srcu_key;
+
+ release_ufile_idr_uobject(file);
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (ib_dev && !ib_dev->disassociate_ucontext)
+ module_put(ib_dev->owner);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+
+ if (atomic_dec_and_test(&file->device->refcount))
+ ib_uverbs_comp_dev(file->device);
+
+ if (file->async_file)
+ kref_put(&file->async_file->ref,
+ ib_uverbs_release_async_event_file);
+ kobject_put(&file->device->kobj);
+ kfree(file);
+}
+
+static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue,
+ struct file *filp, char __user *buf,
+ size_t count, loff_t *pos,
+ size_t eventsz)
+{
+ struct ib_uverbs_event *event;
+ int ret = 0;
+
+ spin_lock_irq(&ev_queue->lock);
+
+ while (list_empty(&ev_queue->event_list)) {
+ spin_unlock_irq(&ev_queue->lock);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(ev_queue->poll_wait,
+ (!list_empty(&ev_queue->event_list) ||
+ ev_queue->is_closed)))
+ return -ERESTARTSYS;
+
+ spin_lock_irq(&ev_queue->lock);
+
+ /* If device was disassociated and no event exists set an error */
+ if (list_empty(&ev_queue->event_list) && ev_queue->is_closed) {
+ spin_unlock_irq(&ev_queue->lock);
+ return -EIO;
+ }
+ }
+
+ event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list);
+
+ if (eventsz > count) {
+ ret = -EINVAL;
+ event = NULL;
+ } else {
+ list_del(ev_queue->event_list.next);
+ if (event->counter) {
+ ++(*event->counter);
+ list_del(&event->obj_list);
+ }
+ }
+
+ spin_unlock_irq(&ev_queue->lock);
+
+ if (event) {
+ if (copy_to_user(buf, event, eventsz))
+ ret = -EFAULT;
+ else
+ ret = eventsz;
+ }
+
+ kfree(event);
+
+ return ret;
+}
+
+static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_async_event_file *file = filp->private_data;
+
+ return ib_uverbs_event_read(&file->ev_queue, filp, buf, count, pos,
+ sizeof(struct ib_uverbs_async_event_desc));
+}
+
+static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_completion_event_file *comp_ev_file =
+ filp->private_data;
+
+ return ib_uverbs_event_read(&comp_ev_file->ev_queue, filp, buf, count,
+ pos,
+ sizeof(struct ib_uverbs_comp_event_desc));
+}
+
+static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue,
+ struct file *filp,
+ struct poll_table_struct *wait)
+{
+ __poll_t pollflags = 0;
+
+ poll_wait(filp, &ev_queue->poll_wait, wait);
+
+ spin_lock_irq(&ev_queue->lock);
+ if (!list_empty(&ev_queue->event_list))
+ pollflags = EPOLLIN | EPOLLRDNORM;
+ else if (ev_queue->is_closed)
+ pollflags = EPOLLERR;
+ spin_unlock_irq(&ev_queue->lock);
+
+ return pollflags;
+}
+
+static __poll_t ib_uverbs_async_event_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct ib_uverbs_async_event_file *file = filp->private_data;
+
+ return ib_uverbs_event_poll(&file->ev_queue, filp, wait);
+}
+
+static __poll_t ib_uverbs_comp_event_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct ib_uverbs_completion_event_file *comp_ev_file =
+ filp->private_data;
+
+ return ib_uverbs_event_poll(&comp_ev_file->ev_queue, filp, wait);
+}
+
+static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on)
+{
+ struct ib_uverbs_async_event_file *file = filp->private_data;
+
+ return fasync_helper(fd, filp, on, &file->ev_queue.async_queue);
+}
+
+static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on)
+{
+ struct ib_uverbs_completion_event_file *comp_ev_file =
+ filp->private_data;
+
+ return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue);
+}
+
+static int ib_uverbs_async_event_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_async_event_file *file = filp->private_data;
+ struct ib_uverbs_file *uverbs_file = file->uverbs_file;
+ struct ib_uverbs_event *entry, *tmp;
+ int closed_already = 0;
+
+ mutex_lock(&uverbs_file->device->lists_mutex);
+ spin_lock_irq(&file->ev_queue.lock);
+ closed_already = file->ev_queue.is_closed;
+ file->ev_queue.is_closed = 1;
+ list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) {
+ if (entry->counter)
+ list_del(&entry->obj_list);
+ kfree(entry);
+ }
+ spin_unlock_irq(&file->ev_queue.lock);
+ if (!closed_already) {
+ list_del(&file->list);
+ ib_unregister_event_handler(&uverbs_file->event_handler);
+ }
+ mutex_unlock(&uverbs_file->device->lists_mutex);
+
+ kref_put(&uverbs_file->ref, ib_uverbs_release_file);
+ kref_put(&file->ref, ib_uverbs_release_async_event_file);
+
+ return 0;
+}
+
+static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uobject *uobj = filp->private_data;
+ struct ib_uverbs_completion_event_file *file = container_of(
+ uobj, struct ib_uverbs_completion_event_file, uobj);
+ struct ib_uverbs_event *entry, *tmp;
+
+ spin_lock_irq(&file->ev_queue.lock);
+ list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) {
+ if (entry->counter)
+ list_del(&entry->obj_list);
+ kfree(entry);
+ }
+ file->ev_queue.is_closed = 1;
+ spin_unlock_irq(&file->ev_queue.lock);
+
+ uverbs_close_fd(filp);
+
+ return 0;
+}
+
+const struct file_operations uverbs_event_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_comp_event_read,
+ .poll = ib_uverbs_comp_event_poll,
+ .release = ib_uverbs_comp_event_close,
+ .fasync = ib_uverbs_comp_event_fasync,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations uverbs_async_event_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_async_event_read,
+ .poll = ib_uverbs_async_event_poll,
+ .release = ib_uverbs_async_event_close,
+ .fasync = ib_uverbs_async_event_fasync,
+ .llseek = no_llseek,
+};
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct ib_uverbs_event_queue *ev_queue = cq_context;
+ struct ib_ucq_object *uobj;
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ if (!ev_queue)
+ return;
+
+ spin_lock_irqsave(&ev_queue->lock, flags);
+ if (ev_queue->is_closed) {
+ spin_unlock_irqrestore(&ev_queue->lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&ev_queue->lock, flags);
+ return;
+ }
+
+ uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+ entry->desc.comp.cq_handle = cq->uobject->user_handle;
+ entry->counter = &uobj->comp_events_reported;
+
+ list_add_tail(&entry->list, &ev_queue->event_list);
+ list_add_tail(&entry->obj_list, &uobj->comp_list);
+ spin_unlock_irqrestore(&ev_queue->lock, flags);
+
+ wake_up_interruptible(&ev_queue->poll_wait);
+ kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN);
+}
+
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+ __u64 element, __u64 event,
+ struct list_head *obj_list,
+ u32 *counter)
+{
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&file->async_file->ev_queue.lock, flags);
+ if (file->async_file->ev_queue.is_closed) {
+ spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags);
+ return;
+ }
+
+ entry->desc.async.element = element;
+ entry->desc.async.event_type = event;
+ entry->desc.async.reserved = 0;
+ entry->counter = counter;
+
+ list_add_tail(&entry->list, &file->async_file->ev_queue.event_list);
+ if (obj_list)
+ list_add_tail(&entry->obj_list, obj_list);
+ spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags);
+
+ wake_up_interruptible(&file->async_file->ev_queue.poll_wait);
+ kill_fasync(&file->async_file->ev_queue.async_queue, SIGIO, POLL_IN);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_ucq_object *uobj = container_of(event->element.cq->uobject,
+ struct ib_ucq_object, uobject);
+
+ ib_uverbs_async_handler(uobj->uobject.ufile, uobj->uobject.user_handle,
+ event->event, &uobj->async_list,
+ &uobj->async_events_reported);
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj;
+
+ /* for XRC target qp's, check that qp is live */
+ if (!event->element.qp->uobject)
+ return;
+
+ uobj = container_of(event->element.qp->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj = container_of(event->element.wq->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj;
+
+ uobj = container_of(event->element.srq->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_uverbs_file *file =
+ container_of(handler, struct ib_uverbs_file, event_handler);
+
+ ib_uverbs_async_handler(file, event->element.port_num, event->event,
+ NULL, NULL);
+}
+
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+ kref_put(&file->async_file->ref, ib_uverbs_release_async_event_file);
+ file->async_file = NULL;
+}
+
+void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue)
+{
+ spin_lock_init(&ev_queue->lock);
+ INIT_LIST_HEAD(&ev_queue->event_list);
+ init_waitqueue_head(&ev_queue->poll_wait);
+ ev_queue->is_closed = 0;
+ ev_queue->async_queue = NULL;
+}
+
+struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev)
+{
+ struct ib_uverbs_async_event_file *ev_file;
+ struct file *filp;
+
+ ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
+ if (!ev_file)
+ return ERR_PTR(-ENOMEM);
+
+ ib_uverbs_init_event_queue(&ev_file->ev_queue);
+ ev_file->uverbs_file = uverbs_file;
+ kref_get(&ev_file->uverbs_file->ref);
+ kref_init(&ev_file->ref);
+ filp = anon_inode_getfile("[infinibandevent]", &uverbs_async_event_fops,
+ ev_file, O_RDONLY);
+ if (IS_ERR(filp))
+ goto err_put_refs;
+
+ mutex_lock(&uverbs_file->device->lists_mutex);
+ list_add_tail(&ev_file->list,
+ &uverbs_file->device->uverbs_events_file_list);
+ mutex_unlock(&uverbs_file->device->lists_mutex);
+
+ WARN_ON(uverbs_file->async_file);
+ uverbs_file->async_file = ev_file;
+ kref_get(&uverbs_file->async_file->ref);
+ INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+ ib_dev,
+ ib_uverbs_event_handler);
+ ib_register_event_handler(&uverbs_file->event_handler);
+ /* At that point async file stuff was fully set */
+
+ return filp;
+
+err_put_refs:
+ kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+ kref_put(&ev_file->ref, ib_uverbs_release_async_event_file);
+ return filp;
+}
+
+static bool verify_command_mask(struct ib_uverbs_file *ufile, u32 command,
+ bool extended)
+{
+ if (!extended)
+ return ufile->uverbs_cmd_mask & BIT_ULL(command);
+
+ return ufile->uverbs_ex_cmd_mask & BIT_ULL(command);
+}
+
+static bool verify_command_idx(u32 command, bool extended)
+{
+ if (extended)
+ return command < ARRAY_SIZE(uverbs_ex_cmd_table) &&
+ uverbs_ex_cmd_table[command];
+
+ return command < ARRAY_SIZE(uverbs_cmd_table) &&
+ uverbs_cmd_table[command];
+}
+
+static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr,
+ u32 *command, bool *extended)
+{
+ if (hdr->command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return -EINVAL;
+
+ *command = hdr->command & IB_USER_VERBS_CMD_COMMAND_MASK;
+ *extended = hdr->command & IB_USER_VERBS_CMD_FLAG_EXTENDED;
+
+ if (!verify_command_idx(*command, *extended))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
+ struct ib_uverbs_ex_cmd_hdr *ex_hdr,
+ size_t count, bool extended)
+{
+ if (extended) {
+ count -= sizeof(*hdr) + sizeof(*ex_hdr);
+
+ if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count)
+ return -EINVAL;
+
+ if (ex_hdr->cmd_hdr_reserved)
+ return -EINVAL;
+
+ if (ex_hdr->response) {
+ if (!hdr->out_words && !ex_hdr->provider_out_words)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_WRITE,
+ u64_to_user_ptr(ex_hdr->response),
+ (hdr->out_words + ex_hdr->provider_out_words) * 8))
+ return -EFAULT;
+ } else {
+ if (hdr->out_words || ex_hdr->provider_out_words)
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ /* not extended command */
+ if (hdr->in_words * 4 != count)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_uverbs_ex_cmd_hdr ex_hdr;
+ struct ib_uverbs_cmd_hdr hdr;
+ bool extended;
+ int srcu_key;
+ u32 command;
+ ssize_t ret;
+
+ if (!ib_safe_file_access(filp)) {
+ pr_err_once("uverbs_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
+ task_tgid_vnr(current), current->comm);
+ return -EACCES;
+ }
+
+ if (count < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ ret = process_hdr(&hdr, &command, &extended);
+ if (ret)
+ return ret;
+
+ if (extended) {
+ if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+ return -EINVAL;
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+ return -EFAULT;
+ }
+
+ ret = verify_hdr(&hdr, &ex_hdr, count, extended);
+ if (ret)
+ return ret;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+
+ if (!verify_command_mask(file, command, extended)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ buf += sizeof(hdr);
+
+ if (!extended) {
+ ret = uverbs_cmd_table[command](file, buf,
+ hdr.in_words * 4,
+ hdr.out_words * 4);
+ } else {
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+
+ buf += sizeof(ex_hdr);
+
+ ib_uverbs_init_udata_buf_or_null(&ucore, buf,
+ u64_to_user_ptr(ex_hdr.response),
+ hdr.in_words * 8, hdr.out_words * 8);
+
+ ib_uverbs_init_udata_buf_or_null(&uhw,
+ buf + ucore.inlen,
+ u64_to_user_ptr(ex_hdr.response) + ucore.outlen,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
+
+ ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw);
+ ret = (ret) ? : count;
+ }
+
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
+}
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_ucontext *ucontext;
+ int ret = 0;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ucontext = ib_uverbs_get_ucontext(file);
+ if (IS_ERR(ucontext)) {
+ ret = PTR_ERR(ucontext);
+ goto out;
+ }
+
+ ret = ucontext->device->mmap(ucontext, vma);
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ * - the ib_uverbs_device structures are properly reference counted and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - there is no ioctl method to race against;
+ * - the open method will either immediately run -ENXIO, or all
+ * required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_device *dev;
+ struct ib_uverbs_file *file;
+ struct ib_device *ib_dev;
+ int ret;
+ int module_dependent;
+ int srcu_key;
+
+ dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
+ if (!atomic_inc_not_zero(&dev->refcount))
+ return -ENXIO;
+
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ mutex_lock(&dev->lists_mutex);
+ ib_dev = srcu_dereference(dev->ib_dev,
+ &dev->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /* In case IB device supports disassociate ucontext, there is no hard
+ * dependency between uverbs device and its low level device.
+ */
+ module_dependent = !(ib_dev->disassociate_ucontext);
+
+ if (module_dependent) {
+ if (!try_module_get(ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+ }
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
+ if (module_dependent)
+ goto err_module;
+
+ goto err;
+ }
+
+ file->device = dev;
+ kref_init(&file->ref);
+ mutex_init(&file->ucontext_lock);
+
+ spin_lock_init(&file->uobjects_lock);
+ INIT_LIST_HEAD(&file->uobjects);
+ init_rwsem(&file->hw_destroy_rwsem);
+
+ filp->private_data = file;
+ kobject_get(&dev->kobj);
+ list_add_tail(&file->list, &dev->uverbs_file_list);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ file->uverbs_cmd_mask = ib_dev->uverbs_cmd_mask;
+ file->uverbs_ex_cmd_mask = ib_dev->uverbs_ex_cmd_mask;
+
+ setup_ufile_idr_uobject(file);
+
+ return nonseekable_open(inode, filp);
+
+err_module:
+ module_put(ib_dev->owner);
+
+err:
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+ if (atomic_dec_and_test(&dev->refcount))
+ ib_uverbs_comp_dev(dev);
+
+ return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE);
+
+ mutex_lock(&file->device->lists_mutex);
+ if (!file->is_closed) {
+ list_del(&file->list);
+ file->is_closed = 1;
+ }
+ mutex_unlock(&file->device->lists_mutex);
+
+ kref_put(&file->ref, ib_uverbs_release_file);
+
+ return 0;
+}
+
+static const struct file_operations uverbs_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+ .unlocked_ioctl = ib_uverbs_ioctl,
+ .compat_ioctl = ib_uverbs_ioctl,
+};
+
+static const struct file_operations uverbs_mmap_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .mmap = ib_uverbs_mmap,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+ .unlocked_ioctl = ib_uverbs_ioctl,
+ .compat_ioctl = ib_uverbs_ioctl,
+};
+
+static struct ib_client uverbs_client = {
+ .name = "uverbs",
+ .add = ib_uverbs_add_one,
+ .remove = ib_uverbs_remove_one
+};
+
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ struct ib_device *ib_dev;
+
+ if (!dev)
+ return -ENODEV;
+
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%s\n", ib_dev->name);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ return ret;
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_dev_abi_version(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_device *ib_dev;
+
+ if (!dev)
+ return -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ return ret;
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_VERBS_ABI_VERSION));
+
+static int ib_uverbs_create_uapi(struct ib_device *device,
+ struct ib_uverbs_device *uverbs_dev)
+{
+ struct uverbs_api *uapi;
+
+ uapi = uverbs_alloc_api(device->driver_specs, device->driver_id);
+ if (IS_ERR(uapi))
+ return PTR_ERR(uapi);
+
+ uverbs_dev->uapi = uapi;
+ return 0;
+}
+
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+ int devnum;
+ dev_t base;
+ struct ib_uverbs_device *uverbs_dev;
+ int ret;
+
+ if (!device->alloc_ucontext)
+ return;
+
+ uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
+ if (!uverbs_dev)
+ return;
+
+ ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+ if (ret) {
+ kfree(uverbs_dev);
+ return;
+ }
+
+ atomic_set(&uverbs_dev->refcount, 1);
+ init_completion(&uverbs_dev->comp);
+ uverbs_dev->xrcd_tree = RB_ROOT;
+ mutex_init(&uverbs_dev->xrcd_tree_mutex);
+ kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+ mutex_init(&uverbs_dev->lists_mutex);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
+
+ devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+ if (devnum >= IB_UVERBS_MAX_DEVICES)
+ goto err;
+ uverbs_dev->devnum = devnum;
+ set_bit(devnum, dev_map);
+ if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
+ base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
+ else
+ base = IB_UVERBS_BASE_DEV + devnum;
+
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
+ uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+ if (ib_uverbs_create_uapi(device, uverbs_dev))
+ goto err_uapi;
+
+ cdev_init(&uverbs_dev->cdev, NULL);
+ uverbs_dev->cdev.owner = THIS_MODULE;
+ uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj);
+ kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+ if (cdev_add(&uverbs_dev->cdev, base, 1))
+ goto err_cdev;
+
+ uverbs_dev->dev = device_create(uverbs_class, device->dev.parent,
+ uverbs_dev->cdev.dev, uverbs_dev,
+ "uverbs%d", uverbs_dev->devnum);
+ if (IS_ERR(uverbs_dev->dev))
+ goto err_cdev;
+
+ if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+ goto err_class;
+ if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+ goto err_class;
+
+ ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+ return;
+
+err_class:
+ device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+err_cdev:
+ cdev_del(&uverbs_dev->cdev);
+err_uapi:
+ clear_bit(devnum, dev_map);
+err:
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
+ wait_for_completion(&uverbs_dev->comp);
+ kobject_put(&uverbs_dev->kobj);
+ return;
+}
+
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+ struct ib_device *ib_dev)
+{
+ struct ib_uverbs_file *file;
+ struct ib_uverbs_async_event_file *event_file;
+ struct ib_event event;
+
+ /* Pending running commands to terminate */
+ uverbs_disassociate_api_pre(uverbs_dev);
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.element.port_num = 0;
+ event.device = ib_dev;
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+ file = list_first_entry(&uverbs_dev->uverbs_file_list,
+ struct ib_uverbs_file, list);
+ file->is_closed = 1;
+ list_del(&file->list);
+ kref_get(&file->ref);
+
+ /* We must release the mutex before going ahead and calling
+ * uverbs_cleanup_ufile, as it might end up indirectly calling
+ * uverbs_close, for example due to freeing the resources (e.g
+ * mmput).
+ */
+ mutex_unlock(&uverbs_dev->lists_mutex);
+
+ ib_uverbs_event_handler(&file->event_handler, &event);
+ uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE);
+ kref_put(&file->ref, ib_uverbs_release_file);
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ }
+
+ while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+ event_file = list_first_entry(&uverbs_dev->
+ uverbs_events_file_list,
+ struct ib_uverbs_async_event_file,
+ list);
+ spin_lock_irq(&event_file->ev_queue.lock);
+ event_file->ev_queue.is_closed = 1;
+ spin_unlock_irq(&event_file->ev_queue.lock);
+
+ list_del(&event_file->list);
+ ib_unregister_event_handler(
+ &event_file->uverbs_file->event_handler);
+ event_file->uverbs_file->event_handler.device =
+ NULL;
+
+ wake_up_interruptible(&event_file->ev_queue.poll_wait);
+ kill_fasync(&event_file->ev_queue.async_queue, SIGIO, POLL_IN);
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
+
+ uverbs_disassociate_api(uverbs_dev->uapi);
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_uverbs_device *uverbs_dev = client_data;
+ int wait_clients = 1;
+
+ if (!uverbs_dev)
+ return;
+
+ dev_set_drvdata(uverbs_dev->dev, NULL);
+ device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+ cdev_del(&uverbs_dev->cdev);
+ clear_bit(uverbs_dev->devnum, dev_map);
+
+ if (device->disassociate_ucontext) {
+ /* We disassociate HW resources and immediately return.
+ * Userspace will see a EIO errno for all future access.
+ * Upon returning, ib_device may be freed internally and is not
+ * valid any more.
+ * uverbs_device is still available until all clients close
+ * their files, then the uverbs device ref count will be zero
+ * and its resources will be freed.
+ * Note: At this point no more files can be opened since the
+ * cdev was deleted, however active clients can still issue
+ * commands and close their open files.
+ */
+ ib_uverbs_free_hw_resources(uverbs_dev, device);
+ wait_clients = 0;
+ }
+
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
+ if (wait_clients)
+ wait_for_completion(&uverbs_dev->comp);
+
+ kobject_put(&uverbs_dev->kobj);
+}
+
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_uverbs_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(IB_UVERBS_BASE_DEV,
+ IB_UVERBS_NUM_FIXED_MINOR,
+ "infiniband_verbs");
+ if (ret) {
+ pr_err("user_verbs: couldn't register device number\n");
+ goto out;
+ }
+
+ ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0,
+ IB_UVERBS_NUM_DYNAMIC_MINOR,
+ "infiniband_verbs");
+ if (ret) {
+ pr_err("couldn't register dynamic device number\n");
+ goto out_alloc;
+ }
+
+ uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+ if (IS_ERR(uverbs_class)) {
+ ret = PTR_ERR(uverbs_class);
+ pr_err("user_verbs: couldn't create class infiniband_verbs\n");
+ goto out_chrdev;
+ }
+
+ uverbs_class->devnode = uverbs_devnode;
+
+ ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
+ if (ret) {
+ pr_err("user_verbs: couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&uverbs_client);
+ if (ret) {
+ pr_err("user_verbs: couldn't register client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_destroy(uverbs_class);
+
+out_chrdev:
+ unregister_chrdev_region(dynamic_uverbs_dev,
+ IB_UVERBS_NUM_DYNAMIC_MINOR);
+
+out_alloc:
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV,
+ IB_UVERBS_NUM_FIXED_MINOR);
+
+out:
+ return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+ ib_unregister_client(&uverbs_client);
+ class_destroy(uverbs_class);
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV,
+ IB_UVERBS_NUM_FIXED_MINOR);
+ unregister_chrdev_region(dynamic_uverbs_dev,
+ IB_UVERBS_NUM_DYNAMIC_MINOR);
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
new file mode 100644
index 000000000..11a080646
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <rdma/ib_marshall.h>
+
+#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL)
+static int rdma_ah_conv_opa_to_ib(struct ib_device *dev,
+ struct rdma_ah_attr *ib,
+ struct rdma_ah_attr *opa)
+{
+ struct ib_port_attr port_attr;
+ int ret = 0;
+
+ /* Do structure copy and the over-write fields */
+ *ib = *opa;
+
+ ib->type = RDMA_AH_ATTR_TYPE_IB;
+ rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0);
+
+ if (ib_query_port(dev, opa->port_num, &port_attr)) {
+ /* Set to default subnet to indicate error */
+ rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX);
+ ret = -EINVAL;
+ } else {
+ rdma_ah_set_subnet_prefix(ib,
+ cpu_to_be64(port_attr.subnet_prefix));
+ }
+ rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa)));
+ return ret;
+}
+
+void ib_copy_ah_attr_to_user(struct ib_device *device,
+ struct ib_uverbs_ah_attr *dst,
+ struct rdma_ah_attr *ah_attr)
+{
+ struct rdma_ah_attr *src = ah_attr;
+ struct rdma_ah_attr conv_ah;
+
+ memset(&dst->grh, 0, sizeof(dst->grh));
+
+ if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) &&
+ (rdma_ah_get_dlid(ah_attr) > be16_to_cpu(IB_LID_PERMISSIVE)) &&
+ (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr)))
+ src = &conv_ah;
+
+ dst->dlid = rdma_ah_get_dlid(src);
+ dst->sl = rdma_ah_get_sl(src);
+ dst->src_path_bits = rdma_ah_get_path_bits(src);
+ dst->static_rate = rdma_ah_get_static_rate(src);
+ dst->is_global = rdma_ah_get_ah_flags(src) &
+ IB_AH_GRH ? 1 : 0;
+ if (dst->is_global) {
+ const struct ib_global_route *grh = rdma_ah_read_grh(src);
+
+ memcpy(dst->grh.dgid, grh->dgid.raw, sizeof(grh->dgid));
+ dst->grh.flow_label = grh->flow_label;
+ dst->grh.sgid_index = grh->sgid_index;
+ dst->grh.hop_limit = grh->hop_limit;
+ dst->grh.traffic_class = grh->traffic_class;
+ }
+ dst->port_num = rdma_ah_get_port_num(src);
+ dst->reserved = 0;
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+void ib_copy_qp_attr_to_user(struct ib_device *device,
+ struct ib_uverbs_qp_attr *dst,
+ struct ib_qp_attr *src)
+{
+ dst->qp_state = src->qp_state;
+ dst->cur_qp_state = src->cur_qp_state;
+ dst->path_mtu = src->path_mtu;
+ dst->path_mig_state = src->path_mig_state;
+ dst->qkey = src->qkey;
+ dst->rq_psn = src->rq_psn;
+ dst->sq_psn = src->sq_psn;
+ dst->dest_qp_num = src->dest_qp_num;
+ dst->qp_access_flags = src->qp_access_flags;
+
+ dst->max_send_wr = src->cap.max_send_wr;
+ dst->max_recv_wr = src->cap.max_recv_wr;
+ dst->max_send_sge = src->cap.max_send_sge;
+ dst->max_recv_sge = src->cap.max_recv_sge;
+ dst->max_inline_data = src->cap.max_inline_data;
+
+ ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr);
+ ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr);
+
+ dst->pkey_index = src->pkey_index;
+ dst->alt_pkey_index = src->alt_pkey_index;
+ dst->en_sqd_async_notify = src->en_sqd_async_notify;
+ dst->sq_draining = src->sq_draining;
+ dst->max_rd_atomic = src->max_rd_atomic;
+ dst->max_dest_rd_atomic = src->max_dest_rd_atomic;
+ dst->min_rnr_timer = src->min_rnr_timer;
+ dst->port_num = src->port_num;
+ dst->timeout = src->timeout;
+ dst->retry_cnt = src->retry_cnt;
+ dst->rnr_retry = src->rnr_retry;
+ dst->alt_port_num = src->alt_port_num;
+ dst->alt_timeout = src->alt_timeout;
+ memset(dst->reserved, 0, sizeof(dst->reserved));
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+static void __ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct sa_path_rec *src)
+{
+ memcpy(dst->dgid, src->dgid.raw, sizeof(src->dgid));
+ memcpy(dst->sgid, src->sgid.raw, sizeof(src->sgid));
+
+ dst->dlid = htons(ntohl(sa_path_get_dlid(src)));
+ dst->slid = htons(ntohl(sa_path_get_slid(src)));
+ dst->raw_traffic = sa_path_get_raw_traffic(src);
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct sa_path_rec *src)
+{
+ struct sa_path_rec rec;
+
+ if (src->rec_type == SA_PATH_REC_TYPE_OPA) {
+ sa_convert_path_opa_to_ib(&rec, src);
+ __ib_copy_path_rec_to_user(dst, &rec);
+ return;
+ }
+ __ib_copy_path_rec_to_user(dst, src);
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+void ib_copy_path_rec_from_user(struct sa_path_rec *dst,
+ struct ib_user_path_rec *src)
+{
+ u32 slid, dlid;
+
+ memset(dst, 0, sizeof(*dst));
+ if ((ib_is_opa_gid((union ib_gid *)src->sgid)) ||
+ (ib_is_opa_gid((union ib_gid *)src->dgid))) {
+ dst->rec_type = SA_PATH_REC_TYPE_OPA;
+ slid = opa_get_lid_from_gid((union ib_gid *)src->sgid);
+ dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid);
+ } else {
+ dst->rec_type = SA_PATH_REC_TYPE_IB;
+ slid = ntohs(src->slid);
+ dlid = ntohs(src->dlid);
+ }
+ memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+ memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+
+ sa_path_set_dlid(dst, dlid);
+ sa_path_set_slid(dst, slid);
+ sa_path_set_raw_traffic(dst, src->raw_traffic);
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+
+ /* TODO: No need to set this */
+ sa_path_set_dmac_zero(dst);
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
new file mode 100644
index 000000000..203cc96ac
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <linux/bug.h>
+#include <linux/file.h>
+#include <rdma/restrack.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int uverbs_free_ah(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ return rdma_destroy_ah((struct ib_ah *)uobject->object);
+}
+
+static int uverbs_free_flow(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_flow *flow = (struct ib_flow *)uobject->object;
+ struct ib_uflow_object *uflow =
+ container_of(uobject, struct ib_uflow_object, uobject);
+ struct ib_qp *qp = flow->qp;
+ int ret;
+
+ ret = flow->device->destroy_flow(flow);
+ if (!ret) {
+ if (qp)
+ atomic_dec(&qp->usecnt);
+ ib_uverbs_flow_resources_free(uflow->resources);
+ }
+
+ return ret;
+}
+
+static int uverbs_free_mw(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ return uverbs_dealloc_mw((struct ib_mw *)uobject->object);
+}
+
+static int uverbs_free_qp(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_qp *qp = uobject->object;
+ struct ib_uqp_object *uqp =
+ container_of(uobject, struct ib_uqp_object, uevent.uobject);
+ int ret;
+
+ /*
+ * If this is a user triggered destroy then do not allow destruction
+ * until the user cleans up all the mcast bindings. Unlike in other
+ * places we forcibly clean up the mcast attachments for !DESTROY
+ * because the mcast attaches are not ubojects and will not be
+ * destroyed by anything else during cleanup processing.
+ */
+ if (why == RDMA_REMOVE_DESTROY) {
+ if (!list_empty(&uqp->mcast_list))
+ return -EBUSY;
+ } else if (qp == qp->real_qp) {
+ ib_uverbs_detach_umcast(qp, uqp);
+ }
+
+ ret = ib_destroy_qp(qp);
+ if (ib_is_destroy_retryable(ret, why, uobject))
+ return ret;
+
+ if (uqp->uxrcd)
+ atomic_dec(&uqp->uxrcd->refcnt);
+
+ ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent);
+ return ret;
+}
+
+static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object;
+ struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
+ int ret;
+
+ ret = ib_destroy_rwq_ind_table(rwq_ind_tbl);
+ if (ib_is_destroy_retryable(ret, why, uobject))
+ return ret;
+
+ kfree(ind_tbl);
+ return ret;
+}
+
+static int uverbs_free_wq(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_wq *wq = uobject->object;
+ struct ib_uwq_object *uwq =
+ container_of(uobject, struct ib_uwq_object, uevent.uobject);
+ int ret;
+
+ ret = ib_destroy_wq(wq);
+ if (ib_is_destroy_retryable(ret, why, uobject))
+ return ret;
+
+ ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent);
+ return ret;
+}
+
+static int uverbs_free_srq(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_srq *srq = uobject->object;
+ struct ib_uevent_object *uevent =
+ container_of(uobject, struct ib_uevent_object, uobject);
+ enum ib_srq_type srq_type = srq->srq_type;
+ int ret;
+
+ ret = ib_destroy_srq(srq);
+ if (ib_is_destroy_retryable(ret, why, uobject))
+ return ret;
+
+ if (srq_type == IB_SRQT_XRC) {
+ struct ib_usrq_object *us =
+ container_of(uevent, struct ib_usrq_object, uevent);
+
+ atomic_dec(&us->uxrcd->refcnt);
+ }
+
+ ib_uverbs_release_uevent(uobject->context->ufile, uevent);
+ return ret;
+}
+
+static int uverbs_free_xrcd(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_xrcd *xrcd = uobject->object;
+ struct ib_uxrcd_object *uxrcd =
+ container_of(uobject, struct ib_uxrcd_object, uobject);
+ int ret;
+
+ ret = ib_destroy_usecnt(&uxrcd->refcnt, why, uobject);
+ if (ret)
+ return ret;
+
+ mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex);
+ ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why);
+ mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex);
+
+ return ret;
+}
+
+static int uverbs_free_pd(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_pd *pd = uobject->object;
+ int ret;
+
+ ret = ib_destroy_usecnt(&pd->usecnt, why, uobject);
+ if (ret)
+ return ret;
+
+ ib_dealloc_pd((struct ib_pd *)uobject->object);
+ return 0;
+}
+
+static int uverbs_hot_unplug_completion_event_file(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
+{
+ struct ib_uverbs_completion_event_file *comp_event_file =
+ container_of(uobj, struct ib_uverbs_completion_event_file,
+ uobj);
+ struct ib_uverbs_event_queue *event_queue = &comp_event_file->ev_queue;
+
+ spin_lock_irq(&event_queue->lock);
+ event_queue->is_closed = 1;
+ spin_unlock_irq(&event_queue->lock);
+
+ if (why == RDMA_REMOVE_DRIVER_REMOVE) {
+ wake_up_interruptible(&event_queue->poll_wait);
+ kill_fasync(&event_queue->async_queue, SIGIO, POLL_IN);
+ }
+ return 0;
+};
+
+int uverbs_destroy_def_handler(struct ib_uverbs_file *file,
+ struct uverbs_attr_bundle *attrs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_destroy_def_handler);
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_COMP_CHANNEL,
+ UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file),
+ uverbs_hot_unplug_completion_event_file,
+ &uverbs_event_fops,
+ "[infinibandevent]",
+ O_RDONLY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_QP,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_SRQ,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object),
+ uverbs_free_srq));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_FLOW,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object),
+ uverbs_free_flow));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_WQ,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_XRCD,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object),
+ uverbs_free_xrcd));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd));
+
+DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE);
+
+DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects,
+ &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE),
+ &UVERBS_OBJECT(UVERBS_OBJECT_PD),
+ &UVERBS_OBJECT(UVERBS_OBJECT_MR),
+ &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL),
+ &UVERBS_OBJECT(UVERBS_OBJECT_CQ),
+ &UVERBS_OBJECT(UVERBS_OBJECT_QP),
+ &UVERBS_OBJECT(UVERBS_OBJECT_AH),
+ &UVERBS_OBJECT(UVERBS_OBJECT_MW),
+ &UVERBS_OBJECT(UVERBS_OBJECT_SRQ),
+ &UVERBS_OBJECT(UVERBS_OBJECT_FLOW),
+ &UVERBS_OBJECT(UVERBS_OBJECT_WQ),
+ &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL),
+ &UVERBS_OBJECT(UVERBS_OBJECT_XRCD),
+ &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION),
+ &UVERBS_OBJECT(UVERBS_OBJECT_DM),
+ &UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS));
+
+const struct uverbs_object_tree_def *uverbs_default_get_objects(void)
+{
+ return &uverbs_default_objects;
+}
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
new file mode 100644
index 000000000..a0ffdcf9a
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+
+static int uverbs_free_counters(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_counters *counters = uobject->object;
+ int ret;
+
+ ret = ib_destroy_usecnt(&counters->usecnt, why, uobject);
+ if (ret)
+ return ret;
+
+ return counters->device->destroy_counters(counters);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
+ struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj = uverbs_attr_get_uobject(
+ attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE);
+ struct ib_device *ib_dev = uobj->context->device;
+ struct ib_counters *counters;
+ int ret;
+
+ /*
+ * This check should be removed once the infrastructure
+ * have the ability to remove methods from parse tree once
+ * such condition is met.
+ */
+ if (!ib_dev->create_counters)
+ return -EOPNOTSUPP;
+
+ counters = ib_dev->create_counters(ib_dev, attrs);
+ if (IS_ERR(counters)) {
+ ret = PTR_ERR(counters);
+ goto err_create_counters;
+ }
+
+ counters->device = ib_dev;
+ counters->uobject = uobj;
+ uobj->object = counters;
+ atomic_set(&counters->usecnt, 0);
+
+ return 0;
+
+err_create_counters:
+ return ret;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(
+ struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_counters_read_attr read_attr = {};
+ const struct uverbs_attr *uattr;
+ struct ib_counters *counters =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE);
+ int ret;
+
+ if (!counters->device->read_counters)
+ return -EOPNOTSUPP;
+
+ if (!atomic_read(&counters->usecnt))
+ return -EINVAL;
+
+ ret = uverbs_get_flags32(&read_attr.flags, attrs,
+ UVERBS_ATTR_READ_COUNTERS_FLAGS,
+ IB_UVERBS_READ_COUNTERS_PREFER_CACHED);
+ if (ret)
+ return ret;
+
+ uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF);
+ read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64);
+ read_attr.counters_buff = uverbs_zalloc(
+ attrs, array_size(read_attr.ncounters, sizeof(u64)));
+ if (IS_ERR(read_attr.counters_buff))
+ return PTR_ERR(read_attr.counters_buff);
+
+ ret = counters->device->read_counters(counters, &read_attr, attrs);
+ if (ret)
+ return ret;
+
+ return uverbs_copy_to(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF,
+ read_attr.counters_buff,
+ read_attr.ncounters * sizeof(u64));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_COUNTERS_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE,
+ UVERBS_OBJECT_COUNTERS,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_COUNTERS_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_COUNTERS_HANDLE,
+ UVERBS_OBJECT_COUNTERS,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_COUNTERS_READ,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_READ_COUNTERS_HANDLE,
+ UVERBS_OBJECT_COUNTERS,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_READ_COUNTERS_BUFF,
+ UVERBS_ATTR_MIN_SIZE(0),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_READ_COUNTERS_FLAGS,
+ enum ib_uverbs_read_counters_flags));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_counters),
+ &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY),
+ &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ));
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
new file mode 100644
index 000000000..5b5f2052c
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int uverbs_free_cq(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_cq *cq = uobject->object;
+ struct ib_uverbs_event_queue *ev_queue = cq->cq_context;
+ struct ib_ucq_object *ucq =
+ container_of(uobject, struct ib_ucq_object, uobject);
+ int ret;
+
+ ret = ib_destroy_cq(cq);
+ if (ib_is_destroy_retryable(ret, why, uobject))
+ return ret;
+
+ ib_uverbs_release_ucq(
+ uobject->context->ufile,
+ ev_queue ? container_of(ev_queue,
+ struct ib_uverbs_completion_event_file,
+ ev_queue) :
+ NULL,
+ ucq);
+ return ret;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
+ struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_ucq_object *obj = container_of(
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
+ typeof(*obj), uobject);
+ struct ib_device *ib_dev = obj->uobject.context->device;
+ struct ib_udata uhw;
+ int ret;
+ u64 user_handle;
+ struct ib_cq_init_attr attr = {};
+ struct ib_cq *cq;
+ struct ib_uverbs_completion_event_file *ev_file = NULL;
+ struct ib_uobject *ev_file_uobj;
+
+ if (!ib_dev->create_cq || !ib_dev->destroy_cq)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&attr.comp_vector, attrs,
+ UVERBS_ATTR_CREATE_CQ_COMP_VECTOR);
+ if (!ret)
+ ret = uverbs_copy_from(&attr.cqe, attrs,
+ UVERBS_ATTR_CREATE_CQ_CQE);
+ if (!ret)
+ ret = uverbs_copy_from(&user_handle, attrs,
+ UVERBS_ATTR_CREATE_CQ_USER_HANDLE);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&attr.flags, attrs,
+ UVERBS_ATTR_CREATE_CQ_FLAGS,
+ IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION |
+ IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN);
+ if (ret)
+ return ret;
+
+ ev_file_uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL);
+ if (!IS_ERR(ev_file_uobj)) {
+ ev_file = container_of(ev_file_uobj,
+ struct ib_uverbs_completion_event_file,
+ uobj);
+ uverbs_uobject_get(ev_file_uobj);
+ }
+
+ if (attr.comp_vector >= file->device->num_comp_vectors) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ obj->comp_events_reported = 0;
+ obj->async_events_reported = 0;
+ INIT_LIST_HEAD(&obj->comp_list);
+ INIT_LIST_HEAD(&obj->async_list);
+
+ /* Temporary, only until drivers get the new uverbs_attr_bundle */
+ create_udata(attrs, &uhw);
+
+ cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, &uhw);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto err_event_file;
+ }
+
+ cq->device = ib_dev;
+ cq->uobject = &obj->uobject;
+ cq->comp_handler = ib_uverbs_comp_handler;
+ cq->event_handler = ib_uverbs_cq_event_handler;
+ cq->cq_context = ev_file ? &ev_file->ev_queue : NULL;
+ obj->uobject.object = cq;
+ obj->uobject.user_handle = user_handle;
+ atomic_set(&cq->usecnt, 0);
+ cq->res.type = RDMA_RESTRACK_CQ;
+ rdma_restrack_add(&cq->res);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe,
+ sizeof(cq->cqe));
+ if (ret)
+ goto err_cq;
+
+ return 0;
+err_cq:
+ ib_destroy_cq(cq);
+
+err_event_file:
+ if (ev_file)
+ uverbs_uobject_put(ev_file_uobj);
+ return ret;
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_CQ_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL,
+ UVERBS_OBJECT_COMP_CHANNEL,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_CQ_FLAGS,
+ enum ib_uverbs_ex_create_cq_flags),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_UHW());
+
+static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
+ struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE);
+ struct ib_ucq_object *obj =
+ container_of(uobj, struct ib_ucq_object, uobject);
+ struct ib_uverbs_destroy_cq_resp resp = {
+ .comp_events_reported = obj->comp_events_reported,
+ .async_events_reported = obj->async_events_reported
+ };
+
+ return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp,
+ sizeof(resp));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_CQ_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP,
+ UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_CQ,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq),
+
+#if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI)
+ &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
+#endif
+);
diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c
new file mode 100644
index 000000000..edc3ff773
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_dm.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+
+static int uverbs_free_dm(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_dm *dm = uobject->object;
+ int ret;
+
+ ret = ib_destroy_usecnt(&dm->usecnt, why, uobject);
+ if (ret)
+ return ret;
+
+ return dm->device->dealloc_dm(dm);
+}
+
+static int
+UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_uverbs_file *file,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dm_alloc_attr attr = {};
+ struct ib_uobject *uobj =
+ uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)
+ ->obj_attr.uobject;
+ struct ib_device *ib_dev = uobj->context->device;
+ struct ib_dm *dm;
+ int ret;
+
+ if (!ib_dev->alloc_dm)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&attr.length, attrs,
+ UVERBS_ATTR_ALLOC_DM_LENGTH);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&attr.alignment, attrs,
+ UVERBS_ATTR_ALLOC_DM_ALIGNMENT);
+ if (ret)
+ return ret;
+
+ dm = ib_dev->alloc_dm(ib_dev, uobj->context, &attr, attrs);
+ if (IS_ERR(dm))
+ return PTR_ERR(dm);
+
+ dm->device = ib_dev;
+ dm->length = attr.length;
+ dm->uobject = uobj;
+ atomic_set(&dm->usecnt, 0);
+
+ uobj->object = dm;
+
+ return 0;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_DM_ALLOC,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DM_HANDLE,
+ UVERBS_OBJECT_DM,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_ALIGNMENT,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_DM_FREE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DM_HANDLE,
+ UVERBS_OBJECT_DM,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_dm),
+ &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC),
+ &UVERBS_METHOD(UVERBS_METHOD_DM_FREE));
diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c
new file mode 100644
index 000000000..d8cfafe23
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+
+static int uverbs_free_flow_action(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ struct ib_flow_action *action = uobject->object;
+ int ret;
+
+ ret = ib_destroy_usecnt(&action->usecnt, why, uobject);
+ if (ret)
+ return ret;
+
+ return action->device->destroy_flow_action(action);
+}
+
+static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs,
+ u32 flags, bool is_modify)
+{
+ u64 verbs_flags = flags;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN))
+ verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED;
+
+ if (is_modify && uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS))
+ verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS;
+
+ return verbs_flags;
+};
+
+static int validate_flow_action_esp_keymat_aes_gcm(struct ib_flow_action_attrs_esp_keymats *keymat)
+{
+ struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm =
+ &keymat->keymat.aes_gcm;
+
+ if (aes_gcm->iv_algo > IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
+ return -EOPNOTSUPP;
+
+ if (aes_gcm->key_len != 32 &&
+ aes_gcm->key_len != 24 &&
+ aes_gcm->key_len != 16)
+ return -EINVAL;
+
+ if (aes_gcm->icv_len != 16 &&
+ aes_gcm->icv_len != 8 &&
+ aes_gcm->icv_len != 12)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_attrs_esp_keymats *keymat) = {
+ [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm,
+};
+
+static int flow_action_esp_replay_none(struct ib_flow_action_attrs_esp_replays *replay,
+ bool is_modify)
+{
+ /* This is used in order to modify an esp flow action with an enabled
+ * replay protection to a disabled one. This is only supported via
+ * modify, as in create verb we can simply drop the REPLAY attribute and
+ * achieve the same thing.
+ */
+ return is_modify ? 0 : -EINVAL;
+}
+
+static int flow_action_esp_replay_def_ok(struct ib_flow_action_attrs_esp_replays *replay,
+ bool is_modify)
+{
+ /* Some replay protections could always be enabled without validating
+ * anything.
+ */
+ return 0;
+}
+
+static int (* const flow_action_esp_replay_validate[])(struct ib_flow_action_attrs_esp_replays *replay,
+ bool is_modify) = {
+ [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = flow_action_esp_replay_none,
+ [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = flow_action_esp_replay_def_ok,
+};
+
+static int parse_esp_ip(enum ib_flow_spec_type proto,
+ const void __user *val_ptr,
+ size_t len, union ib_flow_spec *out)
+{
+ int ret;
+ const struct ib_uverbs_flow_ipv4_filter ipv4 = {
+ .src_ip = cpu_to_be32(0xffffffffUL),
+ .dst_ip = cpu_to_be32(0xffffffffUL),
+ .proto = 0xff,
+ .tos = 0xff,
+ .ttl = 0xff,
+ .flags = 0xff,
+ };
+ const struct ib_uverbs_flow_ipv6_filter ipv6 = {
+ .src_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ .dst_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ .flow_label = cpu_to_be32(0xffffffffUL),
+ .next_hdr = 0xff,
+ .traffic_class = 0xff,
+ .hop_limit = 0xff,
+ };
+ union {
+ struct ib_uverbs_flow_ipv4_filter ipv4;
+ struct ib_uverbs_flow_ipv6_filter ipv6;
+ } user_val = {};
+ const void *user_pmask;
+ size_t val_len;
+
+ /* If the flow IPv4/IPv6 flow specifications are extended, the mask
+ * should be changed as well.
+ */
+ BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv4_filter, flags) +
+ sizeof(ipv4.flags) != sizeof(ipv4));
+ BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv6_filter, reserved) +
+ sizeof(ipv6.reserved) != sizeof(ipv6));
+
+ switch (proto) {
+ case IB_FLOW_SPEC_IPV4:
+ if (len > sizeof(user_val.ipv4) &&
+ !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv4),
+ len - sizeof(user_val.ipv4)))
+ return -EOPNOTSUPP;
+
+ val_len = min_t(size_t, len, sizeof(user_val.ipv4));
+ ret = copy_from_user(&user_val.ipv4, val_ptr,
+ val_len);
+ if (ret)
+ return -EFAULT;
+
+ user_pmask = &ipv4;
+ break;
+ case IB_FLOW_SPEC_IPV6:
+ if (len > sizeof(user_val.ipv6) &&
+ !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv6),
+ len - sizeof(user_val.ipv6)))
+ return -EOPNOTSUPP;
+
+ val_len = min_t(size_t, len, sizeof(user_val.ipv6));
+ ret = copy_from_user(&user_val.ipv6, val_ptr,
+ val_len);
+ if (ret)
+ return -EFAULT;
+
+ user_pmask = &ipv6;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return ib_uverbs_kern_spec_to_ib_spec_filter(proto, user_pmask,
+ &user_val,
+ val_len, out);
+}
+
+static int flow_action_esp_get_encap(struct ib_flow_spec_list *out,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_flow_action_esp_encap uverbs_encap;
+ int ret;
+
+ ret = uverbs_copy_from(&uverbs_encap, attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP);
+ if (ret)
+ return ret;
+
+ /* We currently support only one encap */
+ if (uverbs_encap.next_ptr)
+ return -EOPNOTSUPP;
+
+ if (uverbs_encap.type != IB_FLOW_SPEC_IPV4 &&
+ uverbs_encap.type != IB_FLOW_SPEC_IPV6)
+ return -EOPNOTSUPP;
+
+ return parse_esp_ip(uverbs_encap.type,
+ u64_to_user_ptr(uverbs_encap.val_ptr),
+ uverbs_encap.len,
+ &out->spec);
+}
+
+struct ib_flow_action_esp_attr {
+ struct ib_flow_action_attrs_esp hdr;
+ struct ib_flow_action_attrs_esp_keymats keymat;
+ struct ib_flow_action_attrs_esp_replays replay;
+ /* We currently support only one spec */
+ struct ib_flow_spec_list encap;
+};
+
+#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW
+static int parse_flow_action_esp(struct ib_device *ib_dev,
+ struct ib_uverbs_file *file,
+ struct uverbs_attr_bundle *attrs,
+ struct ib_flow_action_esp_attr *esp_attr,
+ bool is_modify)
+{
+ struct ib_uverbs_flow_action_esp uverbs_esp = {};
+ int ret;
+
+ /* Optional param, if it doesn't exist, we get -ENOENT and skip it */
+ ret = uverbs_copy_from(&esp_attr->hdr.esn, attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_ESN);
+ if (IS_UVERBS_COPY_ERR(ret))
+ return ret;
+
+ /* This can be called from FLOW_ACTION_ESP_MODIFY where
+ * UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS is optional
+ */
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) {
+ ret = uverbs_copy_from_or_zero(&uverbs_esp, attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS);
+ if (ret)
+ return ret;
+
+ if (uverbs_esp.flags & ~((ESP_LAST_SUPPORTED_FLAG << 1) - 1))
+ return -EOPNOTSUPP;
+
+ esp_attr->hdr.spi = uverbs_esp.spi;
+ esp_attr->hdr.seq = uverbs_esp.seq;
+ esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad;
+ esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts;
+ }
+ esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags,
+ is_modify);
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) {
+ esp_attr->keymat.protocol =
+ uverbs_attr_get_enum_id(attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT);
+ ret = uverbs_copy_from_or_zero(&esp_attr->keymat.keymat,
+ attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT);
+ if (ret)
+ return ret;
+
+ ret = flow_action_esp_keymat_validate[esp_attr->keymat.protocol](&esp_attr->keymat);
+ if (ret)
+ return ret;
+
+ esp_attr->hdr.keymat = &esp_attr->keymat;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY)) {
+ esp_attr->replay.protocol =
+ uverbs_attr_get_enum_id(attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY);
+
+ ret = uverbs_copy_from_or_zero(&esp_attr->replay.replay,
+ attrs,
+ UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY);
+ if (ret)
+ return ret;
+
+ ret = flow_action_esp_replay_validate[esp_attr->replay.protocol](&esp_attr->replay,
+ is_modify);
+ if (ret)
+ return ret;
+
+ esp_attr->hdr.replay = &esp_attr->replay;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP)) {
+ ret = flow_action_esp_get_encap(&esp_attr->encap, attrs);
+ if (ret)
+ return ret;
+
+ esp_attr->hdr.encap = &esp_attr->encap;
+ }
+
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
+ struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj = uverbs_attr_get_uobject(
+ attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE);
+ struct ib_device *ib_dev = uobj->context->device;
+ int ret;
+ struct ib_flow_action *action;
+ struct ib_flow_action_esp_attr esp_attr = {};
+
+ if (!ib_dev->create_flow_action_esp)
+ return -EOPNOTSUPP;
+
+ ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, false);
+ if (ret)
+ return ret;
+
+ /* No need to check as this attribute is marked as MANDATORY */
+ action = ib_dev->create_flow_action_esp(ib_dev, &esp_attr.hdr, attrs);
+ if (IS_ERR(action))
+ return PTR_ERR(action);
+
+ atomic_set(&action->usecnt, 0);
+ action->device = ib_dev;
+ action->type = IB_FLOW_ACTION_ESP;
+ action->uobject = uobj;
+ uobj->object = action;
+
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(
+ struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj = uverbs_attr_get_uobject(
+ attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE);
+ struct ib_flow_action *action = uobj->object;
+ int ret;
+ struct ib_flow_action_esp_attr esp_attr = {};
+
+ if (!action->device->modify_flow_action_esp)
+ return -EOPNOTSUPP;
+
+ ret = parse_flow_action_esp(action->device, file, attrs, &esp_attr,
+ true);
+ if (ret)
+ return ret;
+
+ if (action->type != IB_FLOW_ACTION_ESP)
+ return -EINVAL;
+
+ return action->device->modify_flow_action_esp(action, &esp_attr.hdr,
+ attrs);
+}
+
+static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = {
+ [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_STRUCT(
+ struct ib_uverbs_flow_action_esp_keymat_aes_gcm,
+ aes_key),
+ },
+};
+
+static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = {
+ [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_NO_DATA(),
+ },
+ [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp,
+ size),
+ },
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_FLOW_ACTION_ESP_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE,
+ UVERBS_OBJECT_FLOW_ACTION,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp,
+ hard_limit_pkts),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN,
+ UVERBS_ATTR_TYPE(__u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT,
+ uverbs_flow_action_esp_keymat,
+ UA_MANDATORY),
+ UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY,
+ uverbs_flow_action_esp_replay,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(
+ UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP,
+ UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap),
+ UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE,
+ UVERBS_OBJECT_FLOW_ACTION,
+ UVERBS_ACCESS_WRITE,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp,
+ hard_limit_pkts),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN,
+ UVERBS_ATTR_TYPE(__u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT,
+ uverbs_flow_action_esp_keymat,
+ UA_OPTIONAL),
+ UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY,
+ uverbs_flow_action_esp_replay,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(
+ UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP,
+ UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap),
+ UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_FLOW_ACTION_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE,
+ UVERBS_OBJECT_FLOW_ACTION,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_FLOW_ACTION,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action),
+ &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY),
+ &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY));
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
new file mode 100644
index 000000000..cf02e7743
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+
+static int uverbs_free_mr(struct ib_uobject *uobject,
+ enum rdma_remove_reason why)
+{
+ return ib_dereg_mr((struct ib_mr *)uobject->object);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
+ struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dm_mr_attr attr = {};
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE);
+ struct ib_dm *dm =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_DM_HANDLE);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_PD_HANDLE);
+ struct ib_device *ib_dev = pd->device;
+
+ struct ib_mr *mr;
+ int ret;
+
+ if (!ib_dev->reg_dm_mr)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&attr.offset, attrs, UVERBS_ATTR_REG_DM_MR_OFFSET);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&attr.length, attrs,
+ UVERBS_ATTR_REG_DM_MR_LENGTH);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&attr.access_flags, attrs,
+ UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS,
+ IB_ACCESS_SUPPORTED);
+ if (ret)
+ return ret;
+
+ if (!(attr.access_flags & IB_ZERO_BASED))
+ return -EINVAL;
+
+ ret = ib_check_mr_access(attr.access_flags);
+ if (ret)
+ return ret;
+
+ if (attr.offset > dm->length || attr.length > dm->length ||
+ attr.length > dm->length - attr.offset)
+ return -EINVAL;
+
+ mr = pd->device->reg_dm_mr(pd, dm, &attr, attrs);
+ if (IS_ERR(mr))
+ return PTR_ERR(mr);
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->dm = dm;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&dm->usecnt);
+
+ uobj->object = mr;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey,
+ sizeof(mr->lkey));
+ if (ret)
+ goto err_dereg;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+ if (ret)
+ goto err_dereg;
+
+ return 0;
+
+err_dereg:
+ ib_dereg_mr(mr);
+
+ return ret;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_DM_MR_REG,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS,
+ enum ib_access_flags),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_DM_HANDLE,
+ UVERBS_OBJECT_DM,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_MR,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
+ &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG));
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
new file mode 100644
index 000000000..959a3418a
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ */
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/rdma_user_ioctl.h>
+#include <linux/bitops.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size)
+{
+ void *elm;
+ int rc;
+
+ if (key == UVERBS_API_KEY_ERR)
+ return ERR_PTR(-EOVERFLOW);
+
+ elm = kzalloc(alloc_size, GFP_KERNEL);
+ if (!elm)
+ return ERR_PTR(-ENOMEM);
+ rc = radix_tree_insert(&uapi->radix, key, elm);
+ if (rc) {
+ kfree(elm);
+ return ERR_PTR(rc);
+ }
+
+ return elm;
+}
+
+static int uapi_merge_method(struct uverbs_api *uapi,
+ struct uverbs_api_object *obj_elm, u32 obj_key,
+ const struct uverbs_method_def *method,
+ bool is_driver)
+{
+ u32 method_key = obj_key | uapi_key_ioctl_method(method->id);
+ struct uverbs_api_ioctl_method *method_elm;
+ unsigned int i;
+
+ if (!method->attrs)
+ return 0;
+
+ method_elm = uapi_add_elm(uapi, method_key, sizeof(*method_elm));
+ if (IS_ERR(method_elm)) {
+ if (method_elm != ERR_PTR(-EEXIST))
+ return PTR_ERR(method_elm);
+
+ /*
+ * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE
+ */
+ if (WARN_ON(method->handler))
+ return -EINVAL;
+ method_elm = radix_tree_lookup(&uapi->radix, method_key);
+ if (WARN_ON(!method_elm))
+ return -EINVAL;
+ } else {
+ WARN_ON(!method->handler);
+ rcu_assign_pointer(method_elm->handler, method->handler);
+ if (method->handler != uverbs_destroy_def_handler)
+ method_elm->driver_method = is_driver;
+ }
+
+ for (i = 0; i != method->num_attrs; i++) {
+ const struct uverbs_attr_def *attr = (*method->attrs)[i];
+ struct uverbs_api_attr *attr_slot;
+
+ if (!attr)
+ continue;
+
+ /*
+ * ENUM_IN contains the 'ids' pointer to the driver's .rodata,
+ * so if it is specified by a driver then it always makes this
+ * into a driver method.
+ */
+ if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN)
+ method_elm->driver_method |= is_driver;
+
+ attr_slot =
+ uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id),
+ sizeof(*attr_slot));
+ /* Attributes are not allowed to be modified by drivers */
+ if (IS_ERR(attr_slot))
+ return PTR_ERR(attr_slot);
+
+ attr_slot->spec = attr->attr;
+ }
+
+ return 0;
+}
+
+static int uapi_merge_tree(struct uverbs_api *uapi,
+ const struct uverbs_object_tree_def *tree,
+ bool is_driver)
+{
+ unsigned int i, j;
+ int rc;
+
+ if (!tree->objects)
+ return 0;
+
+ for (i = 0; i != tree->num_objects; i++) {
+ const struct uverbs_object_def *obj = (*tree->objects)[i];
+ struct uverbs_api_object *obj_elm;
+ u32 obj_key;
+
+ if (!obj)
+ continue;
+
+ obj_key = uapi_key_obj(obj->id);
+ obj_elm = uapi_add_elm(uapi, obj_key, sizeof(*obj_elm));
+ if (IS_ERR(obj_elm)) {
+ if (obj_elm != ERR_PTR(-EEXIST))
+ return PTR_ERR(obj_elm);
+
+ /* This occurs when a driver uses ADD_UVERBS_METHODS */
+ if (WARN_ON(obj->type_attrs))
+ return -EINVAL;
+ obj_elm = radix_tree_lookup(&uapi->radix, obj_key);
+ if (WARN_ON(!obj_elm))
+ return -EINVAL;
+ } else {
+ obj_elm->type_attrs = obj->type_attrs;
+ if (obj->type_attrs) {
+ obj_elm->type_class =
+ obj->type_attrs->type_class;
+ /*
+ * Today drivers are only permitted to use
+ * idr_class types. They cannot use FD types
+ * because we currently have no way to revoke
+ * the fops pointer after device
+ * disassociation.
+ */
+ if (WARN_ON(is_driver &&
+ obj->type_attrs->type_class !=
+ &uverbs_idr_class))
+ return -EINVAL;
+ }
+ }
+
+ if (!obj->methods)
+ continue;
+
+ for (j = 0; j != obj->num_methods; j++) {
+ const struct uverbs_method_def *method =
+ (*obj->methods)[j];
+ if (!method)
+ continue;
+
+ rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
+ is_driver);
+ if (rc)
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int
+uapi_finalize_ioctl_method(struct uverbs_api *uapi,
+ struct uverbs_api_ioctl_method *method_elm,
+ u32 method_key)
+{
+ struct radix_tree_iter iter;
+ unsigned int num_attrs = 0;
+ unsigned int max_bkey = 0;
+ bool single_uobj = false;
+ void __rcu **slot;
+
+ method_elm->destroy_bkey = UVERBS_API_ATTR_BKEY_LEN;
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter,
+ uapi_key_attrs_start(method_key)) {
+ struct uverbs_api_attr *elm =
+ rcu_dereference_protected(*slot, true);
+ u32 attr_key = iter.index & UVERBS_API_ATTR_KEY_MASK;
+ u32 attr_bkey = uapi_bkey_attr(attr_key);
+ u8 type = elm->spec.type;
+
+ if (uapi_key_attr_to_method(iter.index) !=
+ uapi_key_attr_to_method(method_key))
+ break;
+
+ if (elm->spec.mandatory)
+ __set_bit(attr_bkey, method_elm->attr_mandatory);
+
+ if (type == UVERBS_ATTR_TYPE_IDR ||
+ type == UVERBS_ATTR_TYPE_FD) {
+ u8 access = elm->spec.u.obj.access;
+
+ /*
+ * Verbs specs may only have one NEW/DESTROY, we don't
+ * have the infrastructure to abort multiple NEW's or
+ * cope with multiple DESTROY failure.
+ */
+ if (access == UVERBS_ACCESS_NEW ||
+ access == UVERBS_ACCESS_DESTROY) {
+ if (WARN_ON(single_uobj))
+ return -EINVAL;
+
+ single_uobj = true;
+ if (WARN_ON(!elm->spec.mandatory))
+ return -EINVAL;
+ }
+
+ if (access == UVERBS_ACCESS_DESTROY)
+ method_elm->destroy_bkey = attr_bkey;
+ }
+
+ max_bkey = max(max_bkey, attr_bkey);
+ num_attrs++;
+ }
+
+ method_elm->key_bitmap_len = max_bkey + 1;
+ WARN_ON(method_elm->key_bitmap_len > UVERBS_API_ATTR_BKEY_LEN);
+
+ uapi_compute_bundle_size(method_elm, num_attrs);
+ return 0;
+}
+
+static int uapi_finalize(struct uverbs_api *uapi)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ int rc;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ struct uverbs_api_ioctl_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (uapi_key_is_ioctl_method(iter.index)) {
+ rc = uapi_finalize_ioctl_method(uapi, method_elm,
+ iter.index);
+ if (rc)
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+void uverbs_destroy_api(struct uverbs_api *uapi)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+
+ if (!uapi)
+ return;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ kfree(rcu_dereference_protected(*slot, true));
+ radix_tree_iter_delete(&uapi->radix, &iter, slot);
+ }
+ kfree(uapi);
+}
+
+struct uverbs_api *uverbs_alloc_api(
+ const struct uverbs_object_tree_def *const *driver_specs,
+ enum rdma_driver_id driver_id)
+{
+ struct uverbs_api *uapi;
+ int rc;
+
+ uapi = kzalloc(sizeof(*uapi), GFP_KERNEL);
+ if (!uapi)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
+ uapi->driver_id = driver_id;
+
+ rc = uapi_merge_tree(uapi, uverbs_default_get_objects(), false);
+ if (rc)
+ goto err;
+
+ for (; driver_specs && *driver_specs; driver_specs++) {
+ rc = uapi_merge_tree(uapi, *driver_specs, true);
+ if (rc)
+ goto err;
+ }
+
+ rc = uapi_finalize(uapi);
+ if (rc)
+ goto err;
+
+ return uapi;
+err:
+ if (rc != -ENOMEM)
+ pr_err("Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n",
+ rc);
+
+ uverbs_destroy_api(uapi);
+ return ERR_PTR(rc);
+}
+
+/*
+ * The pre version is done before destroying the HW objects, it only blocks
+ * off method access. All methods that require the ib_dev or the module data
+ * must test one of these assignments prior to continuing.
+ */
+void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev)
+{
+ struct uverbs_api *uapi = uverbs_dev->uapi;
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+
+ rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ if (uapi_key_is_ioctl_method(iter.index)) {
+ struct uverbs_api_ioctl_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (method_elm->driver_method)
+ rcu_assign_pointer(method_elm->handler, NULL);
+ }
+ }
+
+ synchronize_srcu(&uverbs_dev->disassociate_srcu);
+}
+
+/*
+ * Called when a driver disassociates from the ib_uverbs_device. The
+ * assumption is that the driver module will unload after. Replace everything
+ * related to the driver with NULL as a safety measure.
+ */
+void uverbs_disassociate_api(struct uverbs_api *uapi)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ if (uapi_key_is_object(iter.index)) {
+ struct uverbs_api_object *object_elm =
+ rcu_dereference_protected(*slot, true);
+
+ /*
+ * Some type_attrs are in the driver module. We don't
+ * bother to keep track of which since there should be
+ * no use of this after disassociate.
+ */
+ object_elm->type_attrs = NULL;
+ } else if (uapi_key_is_attr(iter.index)) {
+ struct uverbs_api_attr *elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (elm->spec.type == UVERBS_ATTR_TYPE_ENUM_IN)
+ elm->spec.u2.enum_def.ids = NULL;
+ }
+ }
+}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
new file mode 100644
index 000000000..e1ecd4682
--- /dev/null
+++ b/drivers/infiniband/core/verbs.c
@@ -0,0 +1,2630 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/addrconf.h>
+#include <linux/security.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+#include <rdma/rw.h>
+
+#include "core_priv.h"
+
+static int ib_resolve_eth_dmac(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr);
+
+static const char * const ib_events[] = {
+ [IB_EVENT_CQ_ERR] = "CQ error",
+ [IB_EVENT_QP_FATAL] = "QP fatal error",
+ [IB_EVENT_QP_REQ_ERR] = "QP request error",
+ [IB_EVENT_QP_ACCESS_ERR] = "QP access error",
+ [IB_EVENT_COMM_EST] = "communication established",
+ [IB_EVENT_SQ_DRAINED] = "send queue drained",
+ [IB_EVENT_PATH_MIG] = "path migration successful",
+ [IB_EVENT_PATH_MIG_ERR] = "path migration error",
+ [IB_EVENT_DEVICE_FATAL] = "device fatal error",
+ [IB_EVENT_PORT_ACTIVE] = "port active",
+ [IB_EVENT_PORT_ERR] = "port error",
+ [IB_EVENT_LID_CHANGE] = "LID change",
+ [IB_EVENT_PKEY_CHANGE] = "P_key change",
+ [IB_EVENT_SM_CHANGE] = "SM change",
+ [IB_EVENT_SRQ_ERR] = "SRQ error",
+ [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached",
+ [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached",
+ [IB_EVENT_CLIENT_REREGISTER] = "client reregister",
+ [IB_EVENT_GID_CHANGE] = "GID changed",
+};
+
+const char *__attribute_const__ ib_event_msg(enum ib_event_type event)
+{
+ size_t index = event;
+
+ return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ?
+ ib_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(ib_event_msg);
+
+static const char * const wc_statuses[] = {
+ [IB_WC_SUCCESS] = "success",
+ [IB_WC_LOC_LEN_ERR] = "local length error",
+ [IB_WC_LOC_QP_OP_ERR] = "local QP operation error",
+ [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error",
+ [IB_WC_LOC_PROT_ERR] = "local protection error",
+ [IB_WC_WR_FLUSH_ERR] = "WR flushed",
+ [IB_WC_MW_BIND_ERR] = "memory management operation error",
+ [IB_WC_BAD_RESP_ERR] = "bad response error",
+ [IB_WC_LOC_ACCESS_ERR] = "local access error",
+ [IB_WC_REM_INV_REQ_ERR] = "invalid request error",
+ [IB_WC_REM_ACCESS_ERR] = "remote access error",
+ [IB_WC_REM_OP_ERR] = "remote operation error",
+ [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded",
+ [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded",
+ [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error",
+ [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request",
+ [IB_WC_REM_ABORT_ERR] = "operation aborted",
+ [IB_WC_INV_EECN_ERR] = "invalid EE context number",
+ [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state",
+ [IB_WC_FATAL_ERR] = "fatal error",
+ [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error",
+ [IB_WC_GENERAL_ERR] = "general error",
+};
+
+const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status)
+{
+ size_t index = status;
+
+ return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ?
+ wc_statuses[index] : "unrecognized status";
+}
+EXPORT_SYMBOL(ib_wc_status_msg);
+
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 1;
+ case IB_RATE_5_GBPS: return 2;
+ case IB_RATE_10_GBPS: return 4;
+ case IB_RATE_20_GBPS: return 8;
+ case IB_RATE_30_GBPS: return 12;
+ case IB_RATE_40_GBPS: return 16;
+ case IB_RATE_60_GBPS: return 24;
+ case IB_RATE_80_GBPS: return 32;
+ case IB_RATE_120_GBPS: return 48;
+ case IB_RATE_14_GBPS: return 6;
+ case IB_RATE_56_GBPS: return 22;
+ case IB_RATE_112_GBPS: return 45;
+ case IB_RATE_168_GBPS: return 67;
+ case IB_RATE_25_GBPS: return 10;
+ case IB_RATE_100_GBPS: return 40;
+ case IB_RATE_200_GBPS: return 80;
+ case IB_RATE_300_GBPS: return 120;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+{
+ switch (mult) {
+ case 1: return IB_RATE_2_5_GBPS;
+ case 2: return IB_RATE_5_GBPS;
+ case 4: return IB_RATE_10_GBPS;
+ case 8: return IB_RATE_20_GBPS;
+ case 12: return IB_RATE_30_GBPS;
+ case 16: return IB_RATE_40_GBPS;
+ case 24: return IB_RATE_60_GBPS;
+ case 32: return IB_RATE_80_GBPS;
+ case 48: return IB_RATE_120_GBPS;
+ case 6: return IB_RATE_14_GBPS;
+ case 22: return IB_RATE_56_GBPS;
+ case 45: return IB_RATE_112_GBPS;
+ case 67: return IB_RATE_168_GBPS;
+ case 10: return IB_RATE_25_GBPS;
+ case 40: return IB_RATE_100_GBPS;
+ case 80: return IB_RATE_200_GBPS;
+ case 120: return IB_RATE_300_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 2500;
+ case IB_RATE_5_GBPS: return 5000;
+ case IB_RATE_10_GBPS: return 10000;
+ case IB_RATE_20_GBPS: return 20000;
+ case IB_RATE_30_GBPS: return 30000;
+ case IB_RATE_40_GBPS: return 40000;
+ case IB_RATE_60_GBPS: return 60000;
+ case IB_RATE_80_GBPS: return 80000;
+ case IB_RATE_120_GBPS: return 120000;
+ case IB_RATE_14_GBPS: return 14062;
+ case IB_RATE_56_GBPS: return 56250;
+ case IB_RATE_112_GBPS: return 112500;
+ case IB_RATE_168_GBPS: return 168750;
+ case IB_RATE_25_GBPS: return 25781;
+ case IB_RATE_100_GBPS: return 103125;
+ case IB_RATE_200_GBPS: return 206250;
+ case IB_RATE_300_GBPS: return 309375;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+
+ if (node_type == RDMA_NODE_USNIC)
+ return RDMA_TRANSPORT_USNIC;
+ if (node_type == RDMA_NODE_USNIC_UDP)
+ return RDMA_TRANSPORT_USNIC_UDP;
+ if (node_type == RDMA_NODE_RNIC)
+ return RDMA_TRANSPORT_IWARP;
+
+ return RDMA_TRANSPORT_IB;
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+ enum rdma_transport_type lt;
+ if (device->get_link_layer)
+ return device->get_link_layer(device, port_num);
+
+ lt = rdma_node_get_transport(device->node_type);
+ if (lt == RDMA_TRANSPORT_IB)
+ return IB_LINK_LAYER_INFINIBAND;
+
+ return IB_LINK_LAYER_ETHERNET;
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
+struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
+ const char *caller)
+{
+ struct ib_pd *pd;
+ int mr_access_flags = 0;
+
+ pd = device->alloc_pd(device, NULL, NULL);
+ if (IS_ERR(pd))
+ return pd;
+
+ pd->device = device;
+ pd->uobject = NULL;
+ pd->__internal_mr = NULL;
+ atomic_set(&pd->usecnt, 0);
+ pd->flags = flags;
+
+ if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ pd->local_dma_lkey = device->local_dma_lkey;
+ else
+ mr_access_flags |= IB_ACCESS_LOCAL_WRITE;
+
+ if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+ pr_warn("%s: enabling unsafe global rkey\n", caller);
+ mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
+ }
+
+ pd->res.type = RDMA_RESTRACK_PD;
+ pd->res.kern_name = caller;
+ rdma_restrack_add(&pd->res);
+
+ if (mr_access_flags) {
+ struct ib_mr *mr;
+
+ mr = pd->device->get_dma_mr(pd, mr_access_flags);
+ if (IS_ERR(mr)) {
+ ib_dealloc_pd(pd);
+ return ERR_CAST(mr);
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ mr->need_inval = false;
+
+ pd->__internal_mr = mr;
+
+ if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))
+ pd->local_dma_lkey = pd->__internal_mr->lkey;
+
+ if (flags & IB_PD_UNSAFE_GLOBAL_RKEY)
+ pd->unsafe_global_rkey = pd->__internal_mr->rkey;
+ }
+
+ return pd;
+}
+EXPORT_SYMBOL(__ib_alloc_pd);
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist. The caller is responsible to synchronously destroy them and
+ * guarantee no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
+{
+ int ret;
+
+ if (pd->__internal_mr) {
+ ret = pd->device->dereg_mr(pd->__internal_mr);
+ WARN_ON(ret);
+ pd->__internal_mr = NULL;
+ }
+
+ /* uverbs manipulates usecnt with proper locking, while the kabi
+ requires the caller to guarantee we can't race here. */
+ WARN_ON(atomic_read(&pd->usecnt));
+
+ rdma_restrack_del(&pd->res);
+ /* Making delalloc_pd a void return is a WIP, no driver should return
+ an error here. */
+ ret = pd->device->dealloc_pd(pd);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+/**
+ * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination.
+ * @dest: Pointer to destination ah_attr. Contents of the destination
+ * pointer is assumed to be invalid and attribute are overwritten.
+ * @src: Pointer to source ah_attr.
+ */
+void rdma_copy_ah_attr(struct rdma_ah_attr *dest,
+ const struct rdma_ah_attr *src)
+{
+ *dest = *src;
+ if (dest->grh.sgid_attr)
+ rdma_hold_gid_attr(dest->grh.sgid_attr);
+}
+EXPORT_SYMBOL(rdma_copy_ah_attr);
+
+/**
+ * rdma_replace_ah_attr - Replace valid ah_attr with new new one.
+ * @old: Pointer to existing ah_attr which needs to be replaced.
+ * old is assumed to be valid or zero'd
+ * @new: Pointer to the new ah_attr.
+ *
+ * rdma_replace_ah_attr() first releases any reference in the old ah_attr if
+ * old the ah_attr is valid; after that it copies the new attribute and holds
+ * the reference to the replaced ah_attr.
+ */
+void rdma_replace_ah_attr(struct rdma_ah_attr *old,
+ const struct rdma_ah_attr *new)
+{
+ rdma_destroy_ah_attr(old);
+ *old = *new;
+ if (old->grh.sgid_attr)
+ rdma_hold_gid_attr(old->grh.sgid_attr);
+}
+EXPORT_SYMBOL(rdma_replace_ah_attr);
+
+/**
+ * rdma_move_ah_attr - Move ah_attr pointed by source to destination.
+ * @dest: Pointer to destination ah_attr to copy to.
+ * dest is assumed to be valid or zero'd
+ * @src: Pointer to the new ah_attr.
+ *
+ * rdma_move_ah_attr() first releases any reference in the destination ah_attr
+ * if it is valid. This also transfers ownership of internal references from
+ * src to dest, making src invalid in the process. No new reference of the src
+ * ah_attr is taken.
+ */
+void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src)
+{
+ rdma_destroy_ah_attr(dest);
+ *dest = *src;
+ src->grh.sgid_attr = NULL;
+}
+EXPORT_SYMBOL(rdma_move_ah_attr);
+
+/*
+ * Validate that the rdma_ah_attr is valid for the device before passing it
+ * off to the driver.
+ */
+static int rdma_check_ah_attr(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr)
+{
+ if (!rdma_is_port_valid(device, ah_attr->port_num))
+ return -EINVAL;
+
+ if ((rdma_is_grh_required(device, ah_attr->port_num) ||
+ ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) &&
+ !(ah_attr->ah_flags & IB_AH_GRH))
+ return -EINVAL;
+
+ if (ah_attr->grh.sgid_attr) {
+ /*
+ * Make sure the passed sgid_attr is consistent with the
+ * parameters
+ */
+ if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index ||
+ ah_attr->grh.sgid_attr->port_num != ah_attr->port_num)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * If the ah requires a GRH then ensure that sgid_attr pointer is filled in.
+ * On success the caller is responsible to call rdma_unfill_sgid_attr().
+ */
+static int rdma_fill_sgid_attr(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr **old_sgid_attr)
+{
+ const struct ib_gid_attr *sgid_attr;
+ struct ib_global_route *grh;
+ int ret;
+
+ *old_sgid_attr = ah_attr->grh.sgid_attr;
+
+ ret = rdma_check_ah_attr(device, ah_attr);
+ if (ret)
+ return ret;
+
+ if (!(ah_attr->ah_flags & IB_AH_GRH))
+ return 0;
+
+ grh = rdma_ah_retrieve_grh(ah_attr);
+ if (grh->sgid_attr)
+ return 0;
+
+ sgid_attr =
+ rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index);
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+
+ /* Move ownerhip of the kref into the ah_attr */
+ grh->sgid_attr = sgid_attr;
+ return 0;
+}
+
+static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *old_sgid_attr)
+{
+ /*
+ * Fill didn't change anything, the caller retains ownership of
+ * whatever it passed
+ */
+ if (ah_attr->grh.sgid_attr == old_sgid_attr)
+ return;
+
+ /*
+ * Otherwise, we need to undo what rdma_fill_sgid_attr so the caller
+ * doesn't see any change in the rdma_ah_attr. If we get here
+ * old_sgid_attr is NULL.
+ */
+ rdma_destroy_ah_attr(ah_attr);
+}
+
+static const struct ib_gid_attr *
+rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *old_attr)
+{
+ if (old_attr)
+ rdma_put_gid_attr(old_attr);
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ rdma_hold_gid_attr(ah_attr->grh.sgid_attr);
+ return ah_attr->grh.sgid_attr;
+ }
+ return NULL;
+}
+
+static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
+ struct rdma_ah_attr *ah_attr,
+ struct ib_udata *udata)
+{
+ struct ib_ah *ah;
+
+ if (!pd->device->create_ah)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ ah = pd->device->create_ah(pd, ah_attr, udata);
+
+ if (!IS_ERR(ah)) {
+ ah->device = pd->device;
+ ah->pd = pd;
+ ah->uobject = NULL;
+ ah->type = ah_attr->type;
+ ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
+
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ah;
+}
+
+/**
+ * rdma_create_ah - Creates an address handle for the
+ * given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ *
+ * It returns 0 on success and returns appropriate error code on error.
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr)
+{
+ const struct ib_gid_attr *old_sgid_attr;
+ struct ib_ah *ah;
+ int ret;
+
+ ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ah = _rdma_create_ah(pd, ah_attr, NULL);
+
+ rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
+ return ah;
+}
+EXPORT_SYMBOL(rdma_create_ah);
+
+/**
+ * rdma_create_user_ah - Creates an address handle for the
+ * given address vector.
+ * It resolves destination mac address for ah attribute of RoCE type.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ * @udata: pointer to user's input output buffer information need by
+ * provider driver.
+ *
+ * It returns 0 on success and returns appropriate error code on error.
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *rdma_create_user_ah(struct ib_pd *pd,
+ struct rdma_ah_attr *ah_attr,
+ struct ib_udata *udata)
+{
+ const struct ib_gid_attr *old_sgid_attr;
+ struct ib_ah *ah;
+ int err;
+
+ err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
+ if (err)
+ return ERR_PTR(err);
+
+ if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
+ err = ib_resolve_eth_dmac(pd->device, ah_attr);
+ if (err) {
+ ah = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ ah = _rdma_create_ah(pd, ah_attr, udata);
+
+out:
+ rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
+ return ah;
+}
+EXPORT_SYMBOL(rdma_create_user_ah);
+
+int ib_get_rdma_header_version(const union rdma_network_hdr *hdr)
+{
+ const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
+ struct iphdr ip4h_checked;
+ const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
+
+ /* If it's IPv6, the version must be 6, otherwise, the first
+ * 20 bytes (before the IPv4 header) are garbled.
+ */
+ if (ip6h->version != 6)
+ return (ip4h->version == 4) ? 4 : 0;
+ /* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+ /* RoCE v2 requires no options, thus header length
+ * must be 5 words
+ */
+ if (ip4h->ihl != 5)
+ return 6;
+
+ /* Verify checksum.
+ * We can't write on scattered buffers so we need to copy to
+ * temp buffer.
+ */
+ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+ ip4h_checked.check = 0;
+ ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+ /* if IPv4 header checksum is OK, believe it */
+ if (ip4h->check == ip4h_checked.check)
+ return 4;
+ return 6;
+}
+EXPORT_SYMBOL(ib_get_rdma_header_version);
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+ u8 port_num,
+ const struct ib_grh *grh)
+{
+ int grh_version;
+
+ if (rdma_protocol_ib(device, port_num))
+ return RDMA_NETWORK_IB;
+
+ grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh);
+
+ if (grh_version == 4)
+ return RDMA_NETWORK_IPV4;
+
+ if (grh->next_hdr == IPPROTO_UDP)
+ return RDMA_NETWORK_IPV6;
+
+ return RDMA_NETWORK_ROCE_V1;
+}
+
+struct find_gid_index_context {
+ u16 vlan_id;
+ enum ib_gid_type gid_type;
+};
+
+static bool find_gid_index(const union ib_gid *gid,
+ const struct ib_gid_attr *gid_attr,
+ void *context)
+{
+ struct find_gid_index_context *ctx = context;
+
+ if (ctx->gid_type != gid_attr->gid_type)
+ return false;
+
+ if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
+ (is_vlan_dev(gid_attr->ndev) &&
+ vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
+ return false;
+
+ return true;
+}
+
+static const struct ib_gid_attr *
+get_sgid_attr_from_eth(struct ib_device *device, u8 port_num,
+ u16 vlan_id, const union ib_gid *sgid,
+ enum ib_gid_type gid_type)
+{
+ struct find_gid_index_context context = {.vlan_id = vlan_id,
+ .gid_type = gid_type};
+
+ return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index,
+ &context);
+}
+
+int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
+ enum rdma_network_type net_type,
+ union ib_gid *sgid, union ib_gid *dgid)
+{
+ struct sockaddr_in src_in;
+ struct sockaddr_in dst_in;
+ __be32 src_saddr, dst_saddr;
+
+ if (!sgid || !dgid)
+ return -EINVAL;
+
+ if (net_type == RDMA_NETWORK_IPV4) {
+ memcpy(&src_in.sin_addr.s_addr,
+ &hdr->roce4grh.saddr, 4);
+ memcpy(&dst_in.sin_addr.s_addr,
+ &hdr->roce4grh.daddr, 4);
+ src_saddr = src_in.sin_addr.s_addr;
+ dst_saddr = dst_in.sin_addr.s_addr;
+ ipv6_addr_set_v4mapped(src_saddr,
+ (struct in6_addr *)sgid);
+ ipv6_addr_set_v4mapped(dst_saddr,
+ (struct in6_addr *)dgid);
+ return 0;
+ } else if (net_type == RDMA_NETWORK_IPV6 ||
+ net_type == RDMA_NETWORK_IB) {
+ *dgid = hdr->ibgrh.dgid;
+ *sgid = hdr->ibgrh.sgid;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
+
+/* Resolve destination mac address and hop limit for unicast destination
+ * GID entry, considering the source GID entry as well.
+ * ah_attribute must have have valid port_num, sgid_index.
+ */
+static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr);
+ const struct ib_gid_attr *sgid_attr = grh->sgid_attr;
+ int hop_limit = 0xff;
+ int ret = 0;
+
+ /* If destination is link local and source GID is RoCEv1,
+ * IP stack is not used.
+ */
+ if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) &&
+ sgid_attr->gid_type == IB_GID_TYPE_ROCE) {
+ rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
+ ah_attr->roce.dmac);
+ return ret;
+ }
+
+ ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid,
+ ah_attr->roce.dmac,
+ sgid_attr->ndev, &hop_limit);
+
+ grh->hop_limit = hop_limit;
+ return ret;
+}
+
+/*
+ * This function initializes address handle attributes from the incoming packet.
+ * Incoming packet has dgid of the receiver node on which this code is
+ * getting executed and, sgid contains the GID of the sender.
+ *
+ * When resolving mac address of destination, the arrived dgid is used
+ * as sgid and, sgid is used as dgid because sgid contains destinations
+ * GID whom to respond to.
+ *
+ * On success the caller is responsible to call rdma_destroy_ah_attr on the
+ * attr.
+ */
+int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num,
+ const struct ib_wc *wc, const struct ib_grh *grh,
+ struct rdma_ah_attr *ah_attr)
+{
+ u32 flow_class;
+ int ret;
+ enum rdma_network_type net_type = RDMA_NETWORK_IB;
+ enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+ const struct ib_gid_attr *sgid_attr;
+ int hoplimit = 0xff;
+ union ib_gid dgid;
+ union ib_gid sgid;
+
+ might_sleep();
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->type = rdma_ah_find_type(device, port_num);
+ if (rdma_cap_eth_ah(device, port_num)) {
+ if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+ net_type = wc->network_hdr_type;
+ else
+ net_type = ib_get_net_type_by_grh(device, port_num, grh);
+ gid_type = ib_network_to_gid_type(net_type);
+ }
+ ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+ &sgid, &dgid);
+ if (ret)
+ return ret;
+
+ rdma_ah_set_sl(ah_attr, wc->sl);
+ rdma_ah_set_port_num(ah_attr, port_num);
+
+ if (rdma_protocol_roce(device, port_num)) {
+ u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
+ wc->vlan_id : 0xffff;
+
+ if (!(wc->wc_flags & IB_WC_GRH))
+ return -EPROTOTYPE;
+
+ sgid_attr = get_sgid_attr_from_eth(device, port_num,
+ vlan_id, &dgid,
+ gid_type);
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ rdma_move_grh_sgid_attr(ah_attr,
+ &sgid,
+ flow_class & 0xFFFFF,
+ hoplimit,
+ (flow_class >> 20) & 0xFF,
+ sgid_attr);
+
+ ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
+ if (ret)
+ rdma_destroy_ah_attr(ah_attr);
+
+ return ret;
+ } else {
+ rdma_ah_set_dlid(ah_attr, wc->slid);
+ rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);
+
+ if ((wc->wc_flags & IB_WC_GRH) == 0)
+ return 0;
+
+ if (dgid.global.interface_id !=
+ cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
+ sgid_attr = rdma_find_gid_by_port(
+ device, &dgid, IB_GID_TYPE_IB, port_num, NULL);
+ } else
+ sgid_attr = rdma_get_gid_attr(device, port_num, 0);
+
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ rdma_move_grh_sgid_attr(ah_attr,
+ &sgid,
+ flow_class & 0xFFFFF,
+ hoplimit,
+ (flow_class >> 20) & 0xFF,
+ sgid_attr);
+
+ return 0;
+ }
+}
+EXPORT_SYMBOL(ib_init_ah_attr_from_wc);
+
+/**
+ * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership
+ * of the reference
+ *
+ * @attr: Pointer to AH attribute structure
+ * @dgid: Destination GID
+ * @flow_label: Flow label
+ * @hop_limit: Hop limit
+ * @traffic_class: traffic class
+ * @sgid_attr: Pointer to SGID attribute
+ *
+ * This takes ownership of the sgid_attr reference. The caller must ensure
+ * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after
+ * calling this function.
+ */
+void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid,
+ u32 flow_label, u8 hop_limit, u8 traffic_class,
+ const struct ib_gid_attr *sgid_attr)
+{
+ rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit,
+ traffic_class);
+ attr->grh.sgid_attr = sgid_attr;
+}
+EXPORT_SYMBOL(rdma_move_grh_sgid_attr);
+
+/**
+ * rdma_destroy_ah_attr - Release reference to SGID attribute of
+ * ah attribute.
+ * @ah_attr: Pointer to ah attribute
+ *
+ * Release reference to the SGID attribute of the ah attribute if it is
+ * non NULL. It is safe to call this multiple times, and safe to call it on
+ * a zero initialized ah_attr.
+ */
+void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr)
+{
+ if (ah_attr->grh.sgid_attr) {
+ rdma_put_gid_attr(ah_attr->grh.sgid_attr);
+ ah_attr->grh.sgid_attr = NULL;
+ }
+}
+EXPORT_SYMBOL(rdma_destroy_ah_attr);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
+ const struct ib_grh *grh, u8 port_num)
+{
+ struct rdma_ah_attr ah_attr;
+ struct ib_ah *ah;
+ int ret;
+
+ ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ah = rdma_create_ah(pd, &ah_attr);
+
+ rdma_destroy_ah_attr(&ah_attr);
+ return ah;
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
+{
+ const struct ib_gid_attr *old_sgid_attr;
+ int ret;
+
+ if (ah->type != ah_attr->type)
+ return -EINVAL;
+
+ ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr);
+ if (ret)
+ return ret;
+
+ ret = ah->device->modify_ah ?
+ ah->device->modify_ah(ah, ah_attr) :
+ -EOPNOTSUPP;
+
+ ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr);
+ rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_modify_ah);
+
+int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
+{
+ ah_attr->grh.sgid_attr = NULL;
+
+ return ah->device->query_ah ?
+ ah->device->query_ah(ah, ah_attr) :
+ -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(rdma_query_ah);
+
+int rdma_destroy_ah(struct ib_ah *ah)
+{
+ const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
+ struct ib_pd *pd;
+ int ret;
+
+ pd = ah->pd;
+ ret = ah->device->destroy_ah(ah);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ if (sgid_attr)
+ rdma_put_gid_attr(sgid_attr);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_destroy_ah);
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr)
+{
+ struct ib_srq *srq;
+
+ if (!pd->device->create_srq)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+ if (!IS_ERR(srq)) {
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = NULL;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ srq->srq_type = srq_init_attr->srq_type;
+ if (ib_srq_has_cq(srq->srq_type)) {
+ srq->ext.cq = srq_init_attr->ext.cq;
+ atomic_inc(&srq->ext.cq->usecnt);
+ }
+ if (srq->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+ atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+ }
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+ }
+
+ return srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask)
+{
+ return srq->device->modify_srq ?
+ srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+ -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr)
+{
+ return srq->device->query_srq ?
+ srq->device->query_srq(srq, srq_attr) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+ struct ib_pd *pd;
+ enum ib_srq_type srq_type;
+ struct ib_xrcd *uninitialized_var(xrcd);
+ struct ib_cq *uninitialized_var(cq);
+ int ret;
+
+ if (atomic_read(&srq->usecnt))
+ return -EBUSY;
+
+ pd = srq->pd;
+ srq_type = srq->srq_type;
+ if (ib_srq_has_cq(srq_type))
+ cq = srq->ext.cq;
+ if (srq_type == IB_SRQT_XRC)
+ xrcd = srq->ext.xrc.xrcd;
+
+ ret = srq->device->destroy_srq(srq);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ if (srq_type == IB_SRQT_XRC)
+ atomic_dec(&xrcd->usecnt);
+ if (ib_srq_has_cq(srq_type))
+ atomic_dec(&cq->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct ib_qp *qp = context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+ list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+ if (event->element.qp->event_handler)
+ event->element.qp->event_handler(event, event->element.qp->qp_context);
+ spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+ void (*event_handler)(struct ib_event *, void *),
+ void *qp_context)
+{
+ struct ib_qp *qp;
+ unsigned long flags;
+ int err;
+
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->real_qp = real_qp;
+ err = ib_open_shared_qp_security(qp, real_qp->device);
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ qp->real_qp = real_qp;
+ atomic_inc(&real_qp->usecnt);
+ qp->device = real_qp->device;
+ qp->event_handler = event_handler;
+ qp->qp_context = qp_context;
+ qp->qp_num = real_qp->qp_num;
+ qp->qp_type = real_qp->qp_type;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_add(&qp->open_list, &real_qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+ struct ib_qp_open_attr *qp_open_attr)
+{
+ struct ib_qp *qp, *real_qp;
+
+ if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+ return ERR_PTR(-EINVAL);
+
+ qp = ERR_PTR(-EINVAL);
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
+ if (real_qp->qp_num == qp_open_attr->qp_num) {
+ qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+ qp_open_attr->qp_context);
+ break;
+ }
+ }
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+ return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
+static struct ib_qp *create_xrc_qp(struct ib_qp *qp,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *real_qp = qp;
+
+ qp->event_handler = __ib_shared_qp_event_handler;
+ qp->qp_context = qp;
+ qp->pd = NULL;
+ qp->send_cq = qp->recv_cq = NULL;
+ qp->srq = NULL;
+ qp->xrcd = qp_init_attr->xrcd;
+ atomic_inc(&qp_init_attr->xrcd->usecnt);
+ INIT_LIST_HEAD(&qp->open_list);
+
+ qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+ qp_init_attr->qp_context);
+ if (IS_ERR(qp))
+ return qp;
+
+ __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+ return qp;
+}
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
+ struct ib_qp *qp;
+ int ret;
+
+ if (qp_init_attr->rwq_ind_tbl &&
+ (qp_init_attr->recv_cq ||
+ qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
+ qp_init_attr->cap.max_recv_sge))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * If the callers is using the RDMA API calculate the resources
+ * needed for the RDMA READ/WRITE operations.
+ *
+ * Note that these callers need to pass in a port number.
+ */
+ if (qp_init_attr->cap.max_rdma_ctxs)
+ rdma_rw_init_qp(device, qp_init_attr);
+
+ qp = _ib_create_qp(device, pd, qp_init_attr, NULL, NULL);
+ if (IS_ERR(qp))
+ return qp;
+
+ ret = ib_create_qp_security(qp, device);
+ if (ret)
+ goto err;
+
+ qp->real_qp = qp;
+ qp->qp_type = qp_init_attr->qp_type;
+ qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
+
+ atomic_set(&qp->usecnt, 0);
+ qp->mrs_used = 0;
+ spin_lock_init(&qp->mr_lock);
+ INIT_LIST_HEAD(&qp->rdma_mrs);
+ INIT_LIST_HEAD(&qp->sig_mrs);
+ qp->port = 0;
+
+ if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+ struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr);
+
+ if (IS_ERR(xrc_qp)) {
+ ret = PTR_ERR(xrc_qp);
+ goto err;
+ }
+ return xrc_qp;
+ }
+
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+ qp->recv_cq = NULL;
+ qp->srq = NULL;
+ } else {
+ qp->recv_cq = qp_init_attr->recv_cq;
+ if (qp_init_attr->recv_cq)
+ atomic_inc(&qp_init_attr->recv_cq->usecnt);
+ qp->srq = qp_init_attr->srq;
+ if (qp->srq)
+ atomic_inc(&qp_init_attr->srq->usecnt);
+ }
+
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->xrcd = NULL;
+
+ atomic_inc(&pd->usecnt);
+ if (qp_init_attr->send_cq)
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ if (qp_init_attr->rwq_ind_tbl)
+ atomic_inc(&qp->rwq_ind_tbl->usecnt);
+
+ if (qp_init_attr->cap.max_rdma_ctxs) {
+ ret = rdma_rw_init_mrs(qp, qp_init_attr);
+ if (ret)
+ goto err;
+ }
+
+ /*
+ * Note: all hw drivers guarantee that max_send_sge is lower than
+ * the device RDMA WRITE SGE limit but not all hw drivers ensure that
+ * max_send_sge <= max_sge_rd.
+ */
+ qp->max_write_sge = qp_init_attr->cap.max_send_sge;
+ qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
+ device->attrs.max_sge_rd);
+
+ return qp;
+
+err:
+ ib_destroy_qp(qp);
+ return ERR_PTR(ret);
+
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_PORT,
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_XRC_TGT] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ },
+ },
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT |
+ IB_QP_SQ_PSN),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask,
+ enum rdma_link_layer ll)
+{
+ enum ib_qp_attr_mask req_param, opt_param;
+
+ if (mask & IB_QP_CUR_STATE &&
+ cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+ return false;
+
+ if (!qp_state_table[cur_state][next_state].valid)
+ return false;
+
+ req_param = qp_state_table[cur_state][next_state].req_param[type];
+ opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+ if ((mask & req_param) != req_param)
+ return false;
+
+ if (mask & ~(req_param | opt_param | IB_QP_STATE))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
+/**
+ * ib_resolve_eth_dmac - Resolve destination mac address
+ * @device: Device to consider
+ * @ah_attr: address handle attribute which describes the
+ * source and destination parameters
+ * ib_resolve_eth_dmac() resolves destination mac address and L3 hop limit It
+ * returns 0 on success or appropriate error code. It initializes the
+ * necessary ah_attr fields when call is successful.
+ */
+static int ib_resolve_eth_dmac(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr)
+{
+ int ret = 0;
+
+ if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+ __be32 addr = 0;
+
+ memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4);
+ ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac);
+ } else {
+ ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw,
+ (char *)ah_attr->roce.dmac);
+ }
+ } else {
+ ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
+ }
+ return ret;
+}
+
+static bool is_qp_type_connected(const struct ib_qp *qp)
+{
+ return (qp->qp_type == IB_QPT_UC ||
+ qp->qp_type == IB_QPT_RC ||
+ qp->qp_type == IB_QPT_XRC_INI ||
+ qp->qp_type == IB_QPT_XRC_TGT);
+}
+
+/**
+ * IB core internal function to perform QP attributes modification.
+ */
+static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ const struct ib_gid_attr *old_sgid_attr_av;
+ const struct ib_gid_attr *old_sgid_attr_alt_av;
+ int ret;
+
+ if (attr_mask & IB_QP_AV) {
+ ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr,
+ &old_sgid_attr_av);
+ if (ret)
+ return ret;
+ }
+ if (attr_mask & IB_QP_ALT_PATH) {
+ /*
+ * FIXME: This does not track the migration state, so if the
+ * user loads a new alternate path after the HW has migrated
+ * from primary->alternate we will keep the wrong
+ * references. This is OK for IB because the reference
+ * counting does not serve any functional purpose.
+ */
+ ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr,
+ &old_sgid_attr_alt_av);
+ if (ret)
+ goto out_av;
+
+ /*
+ * Today the core code can only handle alternate paths and APM
+ * for IB. Ban them in roce mode.
+ */
+ if (!(rdma_protocol_ib(qp->device,
+ attr->alt_ah_attr.port_num) &&
+ rdma_protocol_ib(qp->device, port))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * If the user provided the qp_attr then we have to resolve it. Kernel
+ * users have to provide already resolved rdma_ah_attr's
+ */
+ if (udata && (attr_mask & IB_QP_AV) &&
+ attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE &&
+ is_qp_type_connected(qp)) {
+ ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr);
+ if (ret)
+ goto out;
+ }
+
+ if (rdma_ib_or_roce(qp->device, port)) {
+ if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) {
+ pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n",
+ __func__, qp->device->name);
+ attr->rq_psn &= 0xffffff;
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) {
+ pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n",
+ __func__, qp->device->name);
+ attr->sq_psn &= 0xffffff;
+ }
+ }
+
+ ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
+ if (ret)
+ goto out;
+
+ if (attr_mask & IB_QP_PORT)
+ qp->port = attr->port_num;
+ if (attr_mask & IB_QP_AV)
+ qp->av_sgid_attr =
+ rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr);
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_path_sgid_attr = rdma_update_sgid_attr(
+ &attr->alt_ah_attr, qp->alt_path_sgid_attr);
+
+out:
+ if (attr_mask & IB_QP_ALT_PATH)
+ rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av);
+out_av:
+ if (attr_mask & IB_QP_AV)
+ rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av);
+ return ret;
+}
+
+/**
+ * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
+ * @ib_qp: The QP to modify.
+ * @attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ * @udata: pointer to user's input output buffer information
+ * are being modified.
+ * It returns 0 on success and returns appropriate error code on error.
+ */
+int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata);
+}
+EXPORT_SYMBOL(ib_modify_qp_with_udata);
+
+int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width)
+{
+ int rc;
+ u32 netdev_speed;
+ struct net_device *netdev;
+ struct ethtool_link_ksettings lksettings;
+
+ if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
+ return -EINVAL;
+
+ if (!dev->get_netdev)
+ return -EOPNOTSUPP;
+
+ netdev = dev->get_netdev(dev, port_num);
+ if (!netdev)
+ return -ENODEV;
+
+ rtnl_lock();
+ rc = __ethtool_get_link_ksettings(netdev, &lksettings);
+ rtnl_unlock();
+
+ dev_put(netdev);
+
+ if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) {
+ netdev_speed = lksettings.base.speed;
+ } else {
+ netdev_speed = SPEED_1000;
+ pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name,
+ netdev_speed);
+ }
+
+ if (netdev_speed <= SPEED_1000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_SDR;
+ } else if (netdev_speed <= SPEED_10000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_FDR10;
+ } else if (netdev_speed <= SPEED_20000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_DDR;
+ } else if (netdev_speed <= SPEED_25000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_EDR;
+ } else if (netdev_speed <= SPEED_40000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_FDR10;
+ } else {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_EDR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_get_eth_speed);
+
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ qp_attr->ah_attr.grh.sgid_attr = NULL;
+ qp_attr->alt_ah_attr.grh.sgid_attr = NULL;
+
+ return qp->device->query_qp ?
+ qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
+ -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_close_qp(struct ib_qp *qp)
+{
+ struct ib_qp *real_qp;
+ unsigned long flags;
+
+ real_qp = qp->real_qp;
+ if (real_qp == qp)
+ return -EINVAL;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_del(&qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ atomic_dec(&real_qp->usecnt);
+ if (qp->qp_sec)
+ ib_close_shared_qp_security(qp->qp_sec);
+ kfree(qp);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+ struct ib_xrcd *xrcd;
+ struct ib_qp *real_qp;
+ int ret;
+
+ real_qp = qp->real_qp;
+ xrcd = real_qp->xrcd;
+
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ ib_close_qp(qp);
+ if (atomic_read(&real_qp->usecnt) == 0)
+ list_del(&real_qp->xrcd_list);
+ else
+ real_qp = NULL;
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+
+ if (real_qp) {
+ ret = ib_destroy_qp(real_qp);
+ if (!ret)
+ atomic_dec(&xrcd->usecnt);
+ else
+ __ib_insert_xrcd_qp(xrcd, real_qp);
+ }
+
+ return 0;
+}
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+ const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
+ const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
+ struct ib_pd *pd;
+ struct ib_cq *scq, *rcq;
+ struct ib_srq *srq;
+ struct ib_rwq_ind_table *ind_tbl;
+ struct ib_qp_security *sec;
+ int ret;
+
+ WARN_ON_ONCE(qp->mrs_used > 0);
+
+ if (atomic_read(&qp->usecnt))
+ return -EBUSY;
+
+ if (qp->real_qp != qp)
+ return __ib_destroy_shared_qp(qp);
+
+ pd = qp->pd;
+ scq = qp->send_cq;
+ rcq = qp->recv_cq;
+ srq = qp->srq;
+ ind_tbl = qp->rwq_ind_tbl;
+ sec = qp->qp_sec;
+ if (sec)
+ ib_destroy_qp_security_begin(sec);
+
+ if (!qp->uobject)
+ rdma_rw_cleanup_mrs(qp);
+
+ rdma_restrack_del(&qp->res);
+ ret = qp->device->destroy_qp(qp);
+ if (!ret) {
+ if (alt_path_sgid_attr)
+ rdma_put_gid_attr(alt_path_sgid_attr);
+ if (av_sgid_attr)
+ rdma_put_gid_attr(av_sgid_attr);
+ if (pd)
+ atomic_dec(&pd->usecnt);
+ if (scq)
+ atomic_dec(&scq->usecnt);
+ if (rcq)
+ atomic_dec(&rcq->usecnt);
+ if (srq)
+ atomic_dec(&srq->usecnt);
+ if (ind_tbl)
+ atomic_dec(&ind_tbl->usecnt);
+ if (sec)
+ ib_destroy_qp_security_end(sec);
+ } else {
+ if (sec)
+ ib_destroy_qp_security_abort(sec);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *__ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context,
+ const struct ib_cq_init_attr *cq_attr,
+ const char *caller)
+{
+ struct ib_cq *cq;
+
+ cq = device->create_cq(device, cq_attr, NULL, NULL);
+
+ if (!IS_ERR(cq)) {
+ cq->device = device;
+ cq->uobject = NULL;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ atomic_set(&cq->usecnt, 0);
+ cq->res.type = RDMA_RESTRACK_CQ;
+ cq->res.kern_name = caller;
+ rdma_restrack_add(&cq->res);
+ }
+
+ return cq;
+}
+EXPORT_SYMBOL(__ib_create_cq);
+
+int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ return cq->device->modify_cq ?
+ cq->device->modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(rdma_set_cq_moderation);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+ if (atomic_read(&cq->usecnt))
+ return -EBUSY;
+
+ rdma_restrack_del(&cq->res);
+ return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+ return cq->device->resize_cq ?
+ cq->device->resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd = mr->pd;
+ struct ib_dm *dm = mr->dm;
+ int ret;
+
+ rdma_restrack_del(&mr->res);
+ ret = mr->device->dereg_mr(mr);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ if (dm)
+ atomic_dec(&dm->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd: protection domain associated with the region
+ * @mr_type: memory region type
+ * @max_num_sg: maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registeration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->alloc_mr)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->dm = NULL;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ mr->need_inval = false;
+ mr->res.type = RDMA_RESTRACK_MR;
+ rdma_restrack_add(&mr->res);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr);
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *fmr;
+
+ if (!pd->device->alloc_fmr)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ if (!IS_ERR(fmr)) {
+ fmr->device = pd->device;
+ fmr->pd = pd;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+
+ if (list_empty(fmr_list))
+ return 0;
+
+ fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+ return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = fmr->pd;
+ ret = fmr->device->dealloc_fmr(fmr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid)
+{
+ struct ib_qp_init_attr init_attr = {};
+ struct ib_qp_attr attr = {};
+ int num_eth_ports = 0;
+ int port;
+
+ /* If QP state >= init, it is assigned to a port and we can check this
+ * port only.
+ */
+ if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) {
+ if (attr.qp_state >= IB_QPS_INIT) {
+ if (rdma_port_get_link_layer(qp->device, attr.port_num) !=
+ IB_LINK_LAYER_INFINIBAND)
+ return true;
+ goto lid_check;
+ }
+ }
+
+ /* Can't get a quick answer, iterate over all ports */
+ for (port = 0; port < qp->device->phys_port_cnt; port++)
+ if (rdma_port_get_link_layer(qp->device, port) !=
+ IB_LINK_LAYER_INFINIBAND)
+ num_eth_ports++;
+
+ /* If we have at lease one Ethernet port, RoCE annex declares that
+ * multicast LID should be ignored. We can't tell at this step if the
+ * QP belongs to an IB or Ethernet port.
+ */
+ if (num_eth_ports)
+ return true;
+
+ /* If all the ports are IB, we can check according to IB spec. */
+lid_check:
+ return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
+ lid == be16_to_cpu(IB_LID_PERMISSIVE));
+}
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->attach_mcast)
+ return -EOPNOTSUPP;
+
+ if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+ qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
+ return -EINVAL;
+
+ ret = qp->device->attach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_inc(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->detach_mcast)
+ return -EOPNOTSUPP;
+
+ if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+ qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
+ return -EINVAL;
+
+ ret = qp->device->detach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_dec(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
+{
+ struct ib_xrcd *xrcd;
+
+ if (!device->alloc_xrcd)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ xrcd = device->alloc_xrcd(device, NULL, NULL);
+ if (!IS_ERR(xrcd)) {
+ xrcd->device = device;
+ xrcd->inode = NULL;
+ atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+ }
+
+ return xrcd;
+}
+EXPORT_SYMBOL(__ib_alloc_xrcd);
+
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ struct ib_qp *qp;
+ int ret;
+
+ if (atomic_read(&xrcd->usecnt))
+ return -EBUSY;
+
+ while (!list_empty(&xrcd->tgt_qp_list)) {
+ qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
+ ret = ib_destroy_qp(qp);
+ if (ret)
+ return ret;
+ }
+
+ return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+/**
+ * ib_create_wq - Creates a WQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the WQ.
+ * @wq_attr: A list of initial attributes required to create the
+ * WQ. If WQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created WQ.
+ *
+ * wq_attr->max_wr and wq_attr->max_sge determine
+ * the requested size of the WQ, and set to the actual values allocated
+ * on return.
+ * If ib_create_wq() succeeds, then max_wr and max_sge will always be
+ * at least as large as the requested values.
+ */
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *wq_attr)
+{
+ struct ib_wq *wq;
+
+ if (!pd->device->create_wq)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ wq = pd->device->create_wq(pd, wq_attr, NULL);
+ if (!IS_ERR(wq)) {
+ wq->event_handler = wq_attr->event_handler;
+ wq->wq_context = wq_attr->wq_context;
+ wq->wq_type = wq_attr->wq_type;
+ wq->cq = wq_attr->cq;
+ wq->device = pd->device;
+ wq->pd = pd;
+ wq->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&wq_attr->cq->usecnt);
+ atomic_set(&wq->usecnt, 0);
+ }
+ return wq;
+}
+EXPORT_SYMBOL(ib_create_wq);
+
+/**
+ * ib_destroy_wq - Destroys the specified WQ.
+ * @wq: The WQ to destroy.
+ */
+int ib_destroy_wq(struct ib_wq *wq)
+{
+ int err;
+ struct ib_cq *cq = wq->cq;
+ struct ib_pd *pd = wq->pd;
+
+ if (atomic_read(&wq->usecnt))
+ return -EBUSY;
+
+ err = wq->device->destroy_wq(wq);
+ if (!err) {
+ atomic_dec(&pd->usecnt);
+ atomic_dec(&cq->usecnt);
+ }
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_wq);
+
+/**
+ * ib_modify_wq - Modifies the specified WQ.
+ * @wq: The WQ to modify.
+ * @wq_attr: On input, specifies the WQ attributes to modify.
+ * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ
+ * are being modified.
+ * On output, the current values of selected WQ attributes are returned.
+ */
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+ u32 wq_attr_mask)
+{
+ int err;
+
+ if (!wq->device->modify_wq)
+ return -EOPNOTSUPP;
+
+ err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL);
+ return err;
+}
+EXPORT_SYMBOL(ib_modify_wq);
+
+/*
+ * ib_create_rwq_ind_table - Creates a RQ Indirection Table.
+ * @device: The device on which to create the rwq indirection table.
+ * @ib_rwq_ind_table_init_attr: A list of initial attributes required to
+ * create the Indirection Table.
+ *
+ * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less
+ * than the created ib_rwq_ind_table object and the caller is responsible
+ * for its memory allocation/free.
+ */
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+ struct ib_rwq_ind_table_init_attr *init_attr)
+{
+ struct ib_rwq_ind_table *rwq_ind_table;
+ int i;
+ u32 table_size;
+
+ if (!device->create_rwq_ind_table)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ table_size = (1 << init_attr->log_ind_tbl_size);
+ rwq_ind_table = device->create_rwq_ind_table(device,
+ init_attr, NULL);
+ if (IS_ERR(rwq_ind_table))
+ return rwq_ind_table;
+
+ rwq_ind_table->ind_tbl = init_attr->ind_tbl;
+ rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size;
+ rwq_ind_table->device = device;
+ rwq_ind_table->uobject = NULL;
+ atomic_set(&rwq_ind_table->usecnt, 0);
+
+ for (i = 0; i < table_size; i++)
+ atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt);
+
+ return rwq_ind_table;
+}
+EXPORT_SYMBOL(ib_create_rwq_ind_table);
+
+/*
+ * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table.
+ * @wq_ind_table: The Indirection Table to destroy.
+*/
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
+{
+ int err, i;
+ u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size);
+ struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl;
+
+ if (atomic_read(&rwq_ind_table->usecnt))
+ return -EBUSY;
+
+ err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table);
+ if (!err) {
+ for (i = 0; i < table_size; i++)
+ atomic_dec(&ind_tbl[i]->usecnt);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
+
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+ struct ib_mr_status *mr_status)
+{
+ return mr->device->check_mr_status ?
+ mr->device->check_mr_status(mr, check_mask, mr_status) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_check_mr_status);
+
+int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
+ int state)
+{
+ if (!device->set_vf_link_state)
+ return -EOPNOTSUPP;
+
+ return device->set_vf_link_state(device, vf, port, state);
+}
+EXPORT_SYMBOL(ib_set_vf_link_state);
+
+int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_info *info)
+{
+ if (!device->get_vf_config)
+ return -EOPNOTSUPP;
+
+ return device->get_vf_config(device, vf, port, info);
+}
+EXPORT_SYMBOL(ib_get_vf_config);
+
+int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_stats *stats)
+{
+ if (!device->get_vf_stats)
+ return -EOPNOTSUPP;
+
+ return device->get_vf_stats(device, vf, port, stats);
+}
+EXPORT_SYMBOL(ib_get_vf_stats);
+
+int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
+ int type)
+{
+ if (!device->set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return device->set_vf_guid(device, vf, port, guid, type);
+}
+EXPORT_SYMBOL(ib_set_vf_guid);
+
+/**
+ * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
+ * and set it the memory region.
+ * @mr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
+ * @page_size: page vector desired page size
+ *
+ * Constraints:
+ * - The first sg element is allowed to have an offset.
+ * - Each sg element must either be aligned to page_size or virtually
+ * contiguous to the previous element. In case an sg element has a
+ * non-contiguous offset, the mapping prefix will not include it.
+ * - The last sg element is allowed to have length less than page_size.
+ * - If sg_nents total byte length exceeds the mr max_num_sge * page_size
+ * then only max_num_sg entries will be mapped.
+ * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these
+ * constraints holds and the page_size argument is ignored.
+ *
+ * Returns the number of sg elements that were mapped to the memory region.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size)
+{
+ if (unlikely(!mr->device->map_mr_sg))
+ return -EOPNOTSUPP;
+
+ mr->page_size = page_size;
+
+ return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg);
+
+/**
+ * ib_sg_to_pages() - Convert the largest prefix of a sg list
+ * to a page vector
+ * @mr: memory region
+ * @sgl: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset_p: IN: start offset in bytes into sg
+ * OUT: offset in bytes for element n of the sg of the first
+ * byte that has not been processed where n is the return
+ * value of this function.
+ * @set_page: driver page assignment function pointer
+ *
+ * Core service helper for drivers to convert the largest
+ * prefix of given sg list to a page vector. The sg list
+ * prefix converted is the prefix that meet the requirements
+ * of ib_map_mr_sg.
+ *
+ * Returns the number of sg elements that were assigned to
+ * a page vector.
+ */
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+ unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
+{
+ struct scatterlist *sg;
+ u64 last_end_dma_addr = 0;
+ unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+ unsigned int last_page_off = 0;
+ u64 page_mask = ~((u64)mr->page_size - 1);
+ int i, ret;
+
+ if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
+ return -EINVAL;
+
+ mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
+ mr->length = 0;
+
+ for_each_sg(sgl, sg, sg_nents, i) {
+ u64 dma_addr = sg_dma_address(sg) + sg_offset;
+ u64 prev_addr = dma_addr;
+ unsigned int dma_len = sg_dma_len(sg) - sg_offset;
+ u64 end_dma_addr = dma_addr + dma_len;
+ u64 page_addr = dma_addr & page_mask;
+
+ /*
+ * For the second and later elements, check whether either the
+ * end of element i-1 or the start of element i is not aligned
+ * on a page boundary.
+ */
+ if (i && (last_page_off != 0 || page_addr != dma_addr)) {
+ /* Stop mapping if there is a gap. */
+ if (last_end_dma_addr != dma_addr)
+ break;
+
+ /*
+ * Coalesce this element with the last. If it is small
+ * enough just update mr->length. Otherwise start
+ * mapping from the next page.
+ */
+ goto next_page;
+ }
+
+ do {
+ ret = set_page(mr, page_addr);
+ if (unlikely(ret < 0)) {
+ sg_offset = prev_addr - sg_dma_address(sg);
+ mr->length += prev_addr - dma_addr;
+ if (sg_offset_p)
+ *sg_offset_p = sg_offset;
+ return i || sg_offset ? i : ret;
+ }
+ prev_addr = page_addr;
+next_page:
+ page_addr += mr->page_size;
+ } while (page_addr < end_dma_addr);
+
+ mr->length += dma_len;
+ last_end_dma_addr = end_dma_addr;
+ last_page_off = end_dma_addr & ~page_mask;
+
+ sg_offset = 0;
+ }
+
+ if (sg_offset_p)
+ *sg_offset_p = 0;
+ return i;
+}
+EXPORT_SYMBOL(ib_sg_to_pages);
+
+struct ib_drain_cqe {
+ struct ib_cqe cqe;
+ struct completion done;
+};
+
+static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
+ cqe);
+
+ complete(&cqe->done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for the SQ.
+ */
+static void __ib_drain_sq(struct ib_qp *qp)
+{
+ struct ib_cq *cq = qp->send_cq;
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_drain_cqe sdrain;
+ struct ib_rdma_wr swr = {
+ .wr = {
+ .next = NULL,
+ { .wr_cqe = &sdrain.cqe, },
+ .opcode = IB_WR_RDMA_WRITE,
+ },
+ };
+ int ret;
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ sdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&sdrain.done);
+
+ ret = ib_post_send(qp, &swr.wr, NULL);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ if (cq->poll_ctx == IB_POLL_DIRECT)
+ while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0)
+ ib_process_cq_direct(cq, -1);
+ else
+ wait_for_completion(&sdrain.done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for the RQ.
+ */
+static void __ib_drain_rq(struct ib_qp *qp)
+{
+ struct ib_cq *cq = qp->recv_cq;
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_drain_cqe rdrain;
+ struct ib_recv_wr rwr = {};
+ int ret;
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ rwr.wr_cqe = &rdrain.cqe;
+ rdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&rdrain.done);
+
+ ret = ib_post_recv(qp, &rwr, NULL);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ if (cq->poll_ctx == IB_POLL_DIRECT)
+ while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0)
+ ib_process_cq_direct(cq, -1);
+ else
+ wait_for_completion(&rdrain.done);
+}
+
+/**
+ * ib_drain_sq() - Block until all SQ CQEs have been consumed by the
+ * application.
+ * @qp: queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that. Otherwise call the generic drain function
+ * __ib_drain_sq().
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ and SQ for the drain work request and
+ * completion.
+ *
+ * allocate the CQ using ib_alloc_cq().
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_sq(struct ib_qp *qp)
+{
+ if (qp->device->drain_sq)
+ qp->device->drain_sq(qp);
+ else
+ __ib_drain_sq(qp);
+}
+EXPORT_SYMBOL(ib_drain_sq);
+
+/**
+ * ib_drain_rq() - Block until all RQ CQEs have been consumed by the
+ * application.
+ * @qp: queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that. Otherwise call the generic drain function
+ * __ib_drain_rq().
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ and RQ for the drain work request and
+ * completion.
+ *
+ * allocate the CQ using ib_alloc_cq().
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_rq(struct ib_qp *qp)
+{
+ if (qp->device->drain_rq)
+ qp->device->drain_rq(qp);
+ else
+ __ib_drain_rq(qp);
+}
+EXPORT_SYMBOL(ib_drain_rq);
+
+/**
+ * ib_drain_qp() - Block until all CQEs have been consumed by the
+ * application on both the RQ and SQ.
+ * @qp: queue pair to drain
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ(s), SQ, and RQ for drain work requests
+ * and completions.
+ *
+ * allocate the CQs using ib_alloc_cq().
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_qp(struct ib_qp *qp)
+{
+ ib_drain_sq(qp);
+ if (!qp->srq)
+ ib_drain_rq(qp);
+}
+EXPORT_SYMBOL(ib_drain_qp);